commit adb35e8dc9

Merge tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Thomas Gleixner:

 - migrate_disable/enable() support which originates from the RT tree and
   is now a prerequisite for the new preemptible kmap_local() API which
   aims to replace kmap_atomic()

 - A fair amount of topology and NUMA related improvements

 - Improvements for the frequency invariant calculations

 - Enhanced robustness for the global CPU priority tracking and decision
   making

 - The usual small fixes and enhancements all over the place

* tag 'sched-core-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
  sched/fair: Trivial correction of the newidle_balance() comment
  sched/fair: Clear SMT siblings after determining the core is not idle
  sched: Fix kernel-doc markup
  x86: Print ratio freq_max/freq_base used in frequency invariance calculations
  x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC
  x86, sched: Calculate frequency invariance for AMD systems
  irq_work: Optimize irq_work_single()
  smp: Cleanup smp_call_function*()
  irq_work: Cleanup
  sched: Limit the amount of NUMA imbalance that can exist at fork time
  sched/numa: Allow a floating imbalance between NUMA nodes
  sched: Avoid unnecessary calculation of load imbalance at clone time
  sched/numa: Rename nr_running and break out the magic number
  sched: Make migrate_disable/enable() independent of RT
  sched/topology: Condition EAS enablement on FIE support
  arm64: Rebuild sched domains on invariance status changes
  sched/topology,schedutil: Wrap sched domains rebuild
  sched/uclamp: Allow to reset a task uclamp constraint value
  sched/core: Fix typos in comments
  Documentation: scheduler: fix information on arch SD flags, sched_domain and sched_debug
  ...
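For orientation, a minimal sketch of the usage pattern migrate_disable()/migrate_enable() enables: staying on the current CPU while remaining preemptible. The per-CPU counter and function names below are hypothetical and not part of this merge; only the two primitives themselves are the real API.

#include <linux/preempt.h>
#include <linux/percpu.h>

/* Hypothetical per-CPU counter, for illustration only. */
static DEFINE_PER_CPU(unsigned long, my_percpu_counter);

static void bump_local_counter(void)
{
	unsigned long *cnt;

	migrate_disable();	/* task may still be preempted, but not migrated */
	cnt = this_cpu_ptr(&my_percpu_counter);
	*cnt += 1;		/* CPU is stable here; note this is not serialized
				 * against other tasks preempting us on this CPU */
	migrate_enable();
}

Unlike a preempt_disable() section, the region between the two calls stays preemptible, which is what the commit message refers to as the prerequisite for a preemptible kmap_local().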
@@ -65,21 +65,17 @@ of the SMP domain will span the entire machine, with each group having the
cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
might have just one domain covering its one NUMA level.

The implementor should read comments in include/linux/sched.h:
struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
the specifics and what to tune.
The implementor should read comments in include/linux/sched/sd_flags.h:
SD_* to get an idea of the specifics and what to tune for the SD flags
of a sched_domain.

Architectures may retain the regular override the default SD_*_INIT flags
while using the generic domain builder in kernel/sched/core.c if they wish to
retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
can be done by #define'ing ARCH_HASH_SCHED_TUNE.

Alternatively, the architecture may completely override the generic domain
builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
arch_init_sched_domains function. This function will attach domains to all
CPUs using cpu_attach_domain.
Architectures may override the generic domain builder and the default SD flags
for a given topology level by creating a sched_domain_topology_level array and
calling set_sched_topology() with this array as the parameter.

The sched-domains debugging infrastructure can be enabled by enabling
CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
which should catch most possible errors (described above). It also prints out
the domain structure in a visual format.
CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
knob. This enables an error checking parse of the sched domains which should
catch most possible errors (described above). It also prints out the domain
structure in a visual format.
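As a rough illustration of the set_sched_topology() interface the updated documentation describes, the sketch below mirrors the shape of the kernel's default_topology/x86_topology tables. The table and function names are illustrative, not taken from this diff.

#include <linux/init.h>
#include <linux/sched/topology.h>
#include <linux/topology.h>

/* Illustrative arch topology table, modelled on default_topology/x86_topology. */
static struct sched_domain_topology_level my_arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init my_arch_init_domains(void)
{
	/* Hand the table to the generic domain builder before domains are built. */
	set_sched_topology(my_arch_topology);
}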
@ -223,6 +223,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
|
||||
|
||||
static int __init init_amu_fie(void)
|
||||
{
|
||||
bool invariance_status = topology_scale_freq_invariant();
|
||||
cpumask_var_t valid_cpus;
|
||||
bool have_policy = false;
|
||||
int ret = 0;
|
||||
@ -269,6 +270,15 @@ static int __init init_amu_fie(void)
|
||||
if (!topology_scale_freq_invariant())
|
||||
static_branch_disable(&amu_fie_key);
|
||||
|
||||
/*
|
||||
* Task scheduler behavior depends on frequency invariance support,
|
||||
* either cpufreq or counter driven. If the support status changes as
|
||||
* a result of counter initialisation and use, retrigger the build of
|
||||
* scheduling domains to ensure the information is propagated properly.
|
||||
*/
|
||||
if (invariance_status != topology_scale_freq_invariant())
|
||||
rebuild_sched_domains_energy();
|
||||
|
||||
free_valid_mask:
|
||||
free_cpumask_var(valid_cpus);
|
||||
|
||||
|
@ -702,7 +702,6 @@ unsigned long arch_align_stack(unsigned long sp)
|
||||
return sp & ALMASK;
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(call_single_data_t, backtrace_csd);
|
||||
static struct cpumask backtrace_csd_busy;
|
||||
|
||||
static void handle_backtrace(void *info)
|
||||
@ -711,6 +710,9 @@ static void handle_backtrace(void *info)
|
||||
cpumask_clear_cpu(smp_processor_id(), &backtrace_csd_busy);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(call_single_data_t, backtrace_csd) =
|
||||
CSD_INIT(handle_backtrace, NULL);
|
||||
|
||||
static void raise_backtrace(cpumask_t *mask)
|
||||
{
|
||||
call_single_data_t *csd;
|
||||
@ -730,7 +732,6 @@ static void raise_backtrace(cpumask_t *mask)
|
||||
}
|
||||
|
||||
csd = &per_cpu(backtrace_csd, cpu);
|
||||
csd->func = handle_backtrace;
|
||||
smp_call_function_single_async(cpu, csd);
|
||||
}
|
||||
}
|
||||
|
@ -687,7 +687,13 @@ EXPORT_SYMBOL(flush_tlb_one);
|
||||
|
||||
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
|
||||
|
||||
static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
|
||||
static void tick_broadcast_callee(void *info)
|
||||
{
|
||||
tick_receive_broadcast();
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd) =
|
||||
CSD_INIT(tick_broadcast_callee, NULL);
|
||||
|
||||
void tick_broadcast(const struct cpumask *mask)
|
||||
{
|
||||
@ -700,23 +706,4 @@ void tick_broadcast(const struct cpumask *mask)
|
||||
}
|
||||
}
|
||||
|
||||
static void tick_broadcast_callee(void *info)
|
||||
{
|
||||
tick_receive_broadcast();
|
||||
}
|
||||
|
||||
static int __init tick_broadcast_init(void)
|
||||
{
|
||||
call_single_data_t *csd;
|
||||
int cpu;
|
||||
|
||||
for (cpu = 0; cpu < NR_CPUS; cpu++) {
|
||||
csd = &per_cpu(tick_broadcast_csd, cpu);
|
||||
csd->func = tick_broadcast_callee;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
early_initcall(tick_broadcast_init);
|
||||
|
||||
#endif /* CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
|
||||
|
@ -179,9 +179,7 @@ static void zpci_handle_fallback_irq(void)
|
||||
if (atomic_inc_return(&cpu_data->scheduled) > 1)
|
||||
continue;
|
||||
|
||||
cpu_data->csd.func = zpci_handle_remote_irq;
|
||||
cpu_data->csd.info = &cpu_data->scheduled;
|
||||
cpu_data->csd.flags = 0;
|
||||
INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled);
|
||||
smp_call_function_single_async(cpu, &cpu_data->csd);
|
||||
}
|
||||
}
|
||||
|
@ -218,4 +218,9 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_ACPI_CPPC_LIB
|
||||
void init_freq_invariance_cppc(void);
|
||||
#define init_freq_invariance_cppc init_freq_invariance_cppc
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_X86_TOPOLOGY_H */
|
||||
|
@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
|
||||
|
||||
init_completion(&cmd.done);
|
||||
for (; count; count -= 16) {
|
||||
call_single_data_t csd = {
|
||||
.func = cpuid_smp_cpuid,
|
||||
.info = &cmd,
|
||||
};
|
||||
call_single_data_t csd;
|
||||
|
||||
INIT_CSD(&csd, cpuid_smp_cpuid, &cmd);
|
||||
|
||||
cmd.regs.eax = pos;
|
||||
cmd.regs.ecx = pos >> 32;
|
||||
|
@ -82,6 +82,10 @@
|
||||
#include <asm/hw_irq.h>
|
||||
#include <asm/stackprotector.h>
|
||||
|
||||
#ifdef CONFIG_ACPI_CPPC_LIB
|
||||
#include <acpi/cppc_acpi.h>
|
||||
#endif
|
||||
|
||||
/* representing HT siblings of each logical CPU */
|
||||
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
|
||||
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
|
||||
@ -148,7 +152,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
|
||||
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
|
||||
}
|
||||
|
||||
static void init_freq_invariance(bool secondary);
|
||||
static void init_freq_invariance(bool secondary, bool cppc_ready);
|
||||
|
||||
/*
|
||||
* Report back to the Boot Processor during boot time or to the caller processor
|
||||
@ -186,7 +190,7 @@ static void smp_callin(void)
|
||||
*/
|
||||
set_cpu_sibling_map(raw_smp_processor_id());
|
||||
|
||||
init_freq_invariance(true);
|
||||
init_freq_invariance(true, false);
|
||||
|
||||
/*
|
||||
* Get our bogomips.
|
||||
@ -1341,7 +1345,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
|
||||
set_sched_topology(x86_topology);
|
||||
|
||||
set_cpu_sibling_map(0);
|
||||
init_freq_invariance(false);
|
||||
init_freq_invariance(false, false);
|
||||
smp_sanity_check();
|
||||
|
||||
switch (apic_intr_mode) {
|
||||
@ -2028,6 +2032,48 @@ out:
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ACPI_CPPC_LIB
|
||||
static bool amd_set_max_freq_ratio(void)
|
||||
{
|
||||
struct cppc_perf_caps perf_caps;
|
||||
u64 highest_perf, nominal_perf;
|
||||
u64 perf_ratio;
|
||||
int rc;
|
||||
|
||||
rc = cppc_get_perf_caps(0, &perf_caps);
|
||||
if (rc) {
|
||||
pr_debug("Could not retrieve perf counters (%d)\n", rc);
|
||||
return false;
|
||||
}
|
||||
|
||||
highest_perf = perf_caps.highest_perf;
|
||||
nominal_perf = perf_caps.nominal_perf;
|
||||
|
||||
if (!highest_perf || !nominal_perf) {
|
||||
pr_debug("Could not retrieve highest or nominal performance\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
|
||||
/* midpoint between max_boost and max_P */
|
||||
perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
|
||||
if (!perf_ratio) {
|
||||
pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
arch_turbo_freq_ratio = perf_ratio;
|
||||
arch_set_max_freq_ratio(false);
|
||||
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
static bool amd_set_max_freq_ratio(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void init_counter_refs(void)
|
||||
{
|
||||
u64 aperf, mperf;
|
||||
@ -2039,7 +2085,7 @@ static void init_counter_refs(void)
|
||||
this_cpu_write(arch_prev_mperf, mperf);
|
||||
}
|
||||
|
||||
static void init_freq_invariance(bool secondary)
|
||||
static void init_freq_invariance(bool secondary, bool cppc_ready)
|
||||
{
|
||||
bool ret = false;
|
||||
|
||||
@ -2055,15 +2101,38 @@ static void init_freq_invariance(bool secondary)
|
||||
|
||||
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
|
||||
ret = intel_set_max_freq_ratio();
|
||||
else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
|
||||
if (!cppc_ready) {
|
||||
return;
|
||||
}
|
||||
ret = amd_set_max_freq_ratio();
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
init_counter_refs();
|
||||
static_branch_enable(&arch_scale_freq_key);
|
||||
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
|
||||
} else {
|
||||
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ACPI_CPPC_LIB
|
||||
static DEFINE_MUTEX(freq_invariance_lock);
|
||||
|
||||
void init_freq_invariance_cppc(void)
|
||||
{
|
||||
static bool secondary;
|
||||
|
||||
mutex_lock(&freq_invariance_lock);
|
||||
|
||||
init_freq_invariance(secondary, true);
|
||||
secondary = true;
|
||||
|
||||
mutex_unlock(&freq_invariance_lock);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void disable_freq_invariance_workfn(struct work_struct *work)
|
||||
{
|
||||
static_branch_disable(&arch_scale_freq_key);
|
||||
@ -2113,7 +2182,7 @@ error:
|
||||
schedule_work(&disable_freq_invariance_work);
|
||||
}
|
||||
#else
|
||||
static inline void init_freq_invariance(bool secondary)
|
||||
static inline void init_freq_invariance(bool secondary, bool cppc_ready)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
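To make the AMD midpoint computation above concrete, here is an illustrative stand-alone calculation with made-up CPPC values (highest_perf = 250, nominal_perf = 180). It only mirrors the arithmetic of amd_set_max_freq_ratio() and is not part of the patch.

/*
 * Illustrative only: mirrors the arithmetic of amd_set_max_freq_ratio()
 * with hypothetical CPPC values; SCHED_CAPACITY_SCALE is 1024.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t highest_perf = 250;	/* hypothetical max_boost */
	uint64_t nominal_perf = 180;	/* hypothetical max_P */
	uint64_t scale = 1024;		/* SCHED_CAPACITY_SCALE */
	uint64_t perf_ratio;

	perf_ratio = highest_perf * scale / nominal_perf;	/* 1422, ~1.39 * 1024 */
	perf_ratio = (perf_ratio + scale) >> 1;			/* midpoint: 1223, ~1.19 * 1024 */

	printf("arch_turbo_freq_ratio = %llu\n", (unsigned long long)perf_ratio);
	return 0;
}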
@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *info)
|
||||
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
|
||||
{
|
||||
struct msr_info_completion rv;
|
||||
call_single_data_t csd = {
|
||||
.func = __rdmsr_safe_on_cpu,
|
||||
.info = &rv,
|
||||
};
|
||||
call_single_data_t csd;
|
||||
int err;
|
||||
|
||||
INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv);
|
||||
|
||||
memset(&rv, 0, sizeof(rv));
|
||||
init_completion(&rv.done);
|
||||
rv.msr.msr_no = msr_no;
|
||||
|
@ -671,9 +671,7 @@ bool blk_mq_complete_request_remote(struct request *rq)
|
||||
return false;
|
||||
|
||||
if (blk_mq_complete_need_ipi(rq)) {
|
||||
rq->csd.func = __blk_mq_complete_request_remote;
|
||||
rq->csd.info = rq;
|
||||
rq->csd.flags = 0;
|
||||
INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
|
||||
smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
|
||||
} else {
|
||||
if (rq->q->nr_hw_queues > 1)
|
||||
|
@ -39,6 +39,7 @@
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/topology.h>
|
||||
|
||||
#include <acpi/cppc_acpi.h>
|
||||
|
||||
@ -688,6 +689,10 @@ static bool is_cppc_supported(int revision, int num_ent)
|
||||
* }
|
||||
*/
|
||||
|
||||
#ifndef init_freq_invariance_cppc
|
||||
static inline void init_freq_invariance_cppc(void) { }
|
||||
#endif
|
||||
|
||||
/**
|
||||
* acpi_cppc_processor_probe - Search for per CPU _CPC objects.
|
||||
* @pr: Ptr to acpi_processor containing this CPU's logical ID.
|
||||
@ -850,6 +855,8 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
init_freq_invariance_cppc();
|
||||
|
||||
kfree(output.pointer);
|
||||
return 0;
|
||||
|
||||
|
@ -674,8 +674,7 @@ have_coupled:
|
||||
coupled->refcnt++;
|
||||
|
||||
csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);
|
||||
csd->func = cpuidle_coupled_handle_poke;
|
||||
csd->info = (void *)(unsigned long)dev->cpu;
|
||||
INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -197,7 +197,7 @@ __notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk))
|
||||
|
||||
llist_for_each_entry_safe(cb, cn,
|
||||
llist_del_all(&rq->execute_cb),
|
||||
work.llnode)
|
||||
work.node.llist)
|
||||
fn(&cb->work);
|
||||
}
|
||||
|
||||
@ -460,7 +460,7 @@ __await_execution(struct i915_request *rq,
|
||||
* callback first, then checking the ACTIVE bit, we serialise with
|
||||
* the completed/retired request.
|
||||
*/
|
||||
if (llist_add(&cb->work.llnode, &signal->execute_cb)) {
|
||||
if (llist_add(&cb->work.node.llist, &signal->execute_cb)) {
|
||||
if (i915_request_is_active(signal) ||
|
||||
__request_in_flight(signal))
|
||||
__notify_execute_cb_imm(signal);
|
||||
|
@ -729,13 +729,8 @@ static void liquidio_napi_drv_callback(void *arg)
|
||||
droq->cpu_id == this_cpu) {
|
||||
napi_schedule_irqoff(&droq->napi);
|
||||
} else {
|
||||
call_single_data_t *csd = &droq->csd;
|
||||
|
||||
csd->func = napi_schedule_wrapper;
|
||||
csd->info = &droq->napi;
|
||||
csd->flags = 0;
|
||||
|
||||
smp_call_function_single_async(droq->cpu_id, csd);
|
||||
INIT_CSD(&droq->csd, napi_schedule_wrapper, &droq->napi);
|
||||
smp_call_function_single_async(droq->cpu_id, &droq->csd);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -383,9 +383,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
|
||||
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
|
||||
{
|
||||
seq_printf(m, "Cpus_allowed:\t%*pb\n",
|
||||
cpumask_pr_args(task->cpus_ptr));
|
||||
cpumask_pr_args(&task->cpus_mask));
|
||||
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
|
||||
cpumask_pr_args(task->cpus_ptr));
|
||||
cpumask_pr_args(&task->cpus_mask));
|
||||
}
|
||||
|
||||
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
|
||||
|
@ -152,6 +152,7 @@ enum cpuhp_state {
|
||||
CPUHP_AP_ONLINE,
|
||||
CPUHP_TEARDOWN_CPU,
|
||||
CPUHP_AP_ONLINE_IDLE,
|
||||
CPUHP_AP_SCHED_WAIT_EMPTY,
|
||||
CPUHP_AP_SMPBOOT_THREADS,
|
||||
CPUHP_AP_X86_VDSO_VMA_ONLINE,
|
||||
CPUHP_AP_IRQ_AFFINITY_ONLINE,
|
||||
|
@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
	return cpumask_next_and(-1, src1p, src2p);
}

static inline int cpumask_any_distribute(const struct cpumask *srcp)
{
	return cpumask_first(srcp);
}

#define for_each_cpu(cpu, mask)			\
	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_not(cpu, mask)		\
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);
int cpumask_any_and_distribute(const struct cpumask *src1p,
			       const struct cpumask *src2p);
int cpumask_any_distribute(const struct cpumask *srcp);

/**
 * for_each_cpu - iterate over every cpu in a mask
@@ -14,28 +14,37 @@
 */

struct irq_work {
	union {
		struct __call_single_node node;
		struct {
			struct llist_node llnode;
			atomic_t flags;
		};
	};
	void (*func)(struct irq_work *);
};

#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){	\
	.node = { .u_flags = (_flags), },			\
	.func = (_func),					\
}

#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0)
#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY)
#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ)

#define DEFINE_IRQ_WORK(name, _f)	\
	struct irq_work name = IRQ_WORK_INIT(_f)

static inline
void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
{
	atomic_set(&work->flags, 0);
	work->func = func;
	*work = IRQ_WORK_INIT(func);
}

#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = {	\
	.flags = ATOMIC_INIT(0),				\
	.func = (_f)						\
static inline bool irq_work_is_pending(struct irq_work *work)
{
	return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING;
}

static inline bool irq_work_is_busy(struct irq_work *work)
{
	return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
}

bool irq_work_queue(struct irq_work *work);
bool irq_work_queue_on(struct irq_work *work, int cpu);
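A small, hypothetical user of the initializers introduced above might look like the following sketch. The handler and its names are made up; only DEFINE_IRQ_WORK(), irq_work_queue() and the pr_* helpers are the real API.

#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/smp.h>

/* Hypothetical handler; runs from the irq_work IPI, i.e. hard interrupt context. */
static void my_irq_work_fn(struct irq_work *work)
{
	pr_info("irq_work ran on CPU%d\n", smp_processor_id());
}

/* Static initialization via the IRQ_WORK_INIT()-based helper. */
static DEFINE_IRQ_WORK(my_work, my_irq_work_fn);

static void kick_my_work(void)
{
	/* Returns false if the work was already pending. */
	if (!irq_work_queue(&my_work))
		pr_debug("my_work already queued\n");
}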
@ -107,14 +107,14 @@ do { \
|
||||
current->irq_config = 0; \
|
||||
} while (0)
|
||||
|
||||
# define lockdep_irq_work_enter(__work) \
|
||||
# define lockdep_irq_work_enter(_flags) \
|
||||
do { \
|
||||
if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
|
||||
if (!((_flags) & IRQ_WORK_HARD_IRQ)) \
|
||||
current->irq_config = 1; \
|
||||
} while (0)
|
||||
# define lockdep_irq_work_exit(__work) \
|
||||
# define lockdep_irq_work_exit(_flags) \
|
||||
do { \
|
||||
if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
|
||||
if (!((_flags) & IRQ_WORK_HARD_IRQ)) \
|
||||
current->irq_config = 0; \
|
||||
} while (0)
|
||||
|
||||
|
@ -204,6 +204,7 @@ extern int _cond_resched(void);
|
||||
extern void ___might_sleep(const char *file, int line, int preempt_offset);
|
||||
extern void __might_sleep(const char *file, int line, int preempt_offset);
|
||||
extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
||||
extern void __cant_migrate(const char *file, int line);
|
||||
|
||||
/**
|
||||
* might_sleep - annotation for functions that can sleep
|
||||
@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
||||
# define cant_sleep() \
|
||||
do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
|
||||
# define sched_annotate_sleep() (current->task_state_change = 0)
|
||||
|
||||
/**
|
||||
* cant_migrate - annotation for functions that cannot migrate
|
||||
*
|
||||
* Will print a stack trace if executed in code which is migratable
|
||||
*/
|
||||
# define cant_migrate() \
|
||||
do { \
|
||||
if (IS_ENABLED(CONFIG_SMP)) \
|
||||
__cant_migrate(__FILE__, __LINE__); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* non_block_start - annotate the start of section where sleeping is prohibited
|
||||
*
|
||||
@ -251,6 +264,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
||||
int preempt_offset) { }
|
||||
# define might_sleep() do { might_resched(); } while (0)
|
||||
# define cant_sleep() do { } while (0)
|
||||
# define cant_migrate() do { } while (0)
|
||||
# define sched_annotate_sleep() do { } while (0)
|
||||
# define non_block_start() do { } while (0)
|
||||
# define non_block_end() do { } while (0)
|
||||
@ -258,13 +272,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
||||
|
||||
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
|
||||
|
||||
#ifndef CONFIG_PREEMPT_RT
|
||||
# define cant_migrate() cant_sleep()
|
||||
#else
|
||||
/* Placeholder for now */
|
||||
# define cant_migrate() do { } while (0)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* abs - return absolute value of an argument
|
||||
* @x: the value. If it is unsigned type, it is converted to signed type first.
|
||||
|
@ -322,34 +322,71 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* migrate_disable - Prevent migration of the current task
|
||||
*
|
||||
* Maps to preempt_disable() which also disables preemption. Use
|
||||
* migrate_disable() to annotate that the intent is to prevent migration,
|
||||
* but not necessarily preemption.
|
||||
*
|
||||
* Can be invoked nested like preempt_disable() and needs the corresponding
|
||||
* number of migrate_enable() invocations.
|
||||
*/
|
||||
static __always_inline void migrate_disable(void)
|
||||
{
|
||||
preempt_disable();
|
||||
}
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
/**
|
||||
* migrate_enable - Allow migration of the current task
|
||||
/*
|
||||
* Migrate-Disable and why it is undesired.
|
||||
*
|
||||
* Counterpart to migrate_disable().
|
||||
* When a preempted task becomes elegible to run under the ideal model (IOW it
|
||||
* becomes one of the M highest priority tasks), it might still have to wait
|
||||
* for the preemptee's migrate_disable() section to complete. Thereby suffering
|
||||
* a reduction in bandwidth in the exact duration of the migrate_disable()
|
||||
* section.
|
||||
*
|
||||
* As migrate_disable() can be invoked nested, only the outermost invocation
|
||||
* reenables migration.
|
||||
* Per this argument, the change from preempt_disable() to migrate_disable()
|
||||
* gets us:
|
||||
*
|
||||
* - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
|
||||
* it would have had to wait for the lower priority task.
|
||||
*
|
||||
* - a lower priority tasks; which under preempt_disable() could've instantly
|
||||
* migrated away when another CPU becomes available, is now constrained
|
||||
* by the ability to push the higher priority task away, which might itself be
|
||||
* in a migrate_disable() section, reducing it's available bandwidth.
|
||||
*
|
||||
* IOW it trades latency / moves the interference term, but it stays in the
|
||||
* system, and as long as it remains unbounded, the system is not fully
|
||||
* deterministic.
|
||||
*
|
||||
*
|
||||
* The reason we have it anyway.
|
||||
*
|
||||
* PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
|
||||
* number of primitives into becoming preemptible, they would also allow
|
||||
* migration. This turns out to break a bunch of per-cpu usage. To this end,
|
||||
* all these primitives employ migirate_disable() to restore this implicit
|
||||
* assumption.
|
||||
*
|
||||
* This is a 'temporary' work-around at best. The correct solution is getting
|
||||
* rid of the above assumptions and reworking the code to employ explicit
|
||||
* per-cpu locking or short preempt-disable regions.
|
||||
*
|
||||
* The end goal must be to get rid of migrate_disable(), alternatively we need
|
||||
* a schedulability theory that does not depend on abritrary migration.
|
||||
*
|
||||
*
|
||||
* Notes on the implementation.
|
||||
*
|
||||
* The implementation is particularly tricky since existing code patterns
|
||||
* dictate neither migrate_disable() nor migrate_enable() is allowed to block.
|
||||
* This means that it cannot use cpus_read_lock() to serialize against hotplug,
|
||||
* nor can it easily migrate itself into a pending affinity mask change on
|
||||
* migrate_enable().
|
||||
*
|
||||
*
|
||||
* Note: even non-work-conserving schedulers like semi-partitioned depends on
|
||||
* migration, so migrate_disable() is not only a problem for
|
||||
* work-conserving schedulers.
|
||||
*
|
||||
* Currently mapped to preempt_enable().
|
||||
*/
|
||||
static __always_inline void migrate_enable(void)
|
||||
{
|
||||
preempt_enable();
|
||||
}
|
||||
extern void migrate_disable(void);
|
||||
extern void migrate_enable(void);
|
||||
|
||||
#else
|
||||
|
||||
static inline void migrate_disable(void) { }
|
||||
static inline void migrate_enable(void) { }
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#endif /* __LINUX_PREEMPT_H */
|
||||
|
@ -723,6 +723,11 @@ struct task_struct {
|
||||
int nr_cpus_allowed;
|
||||
const cpumask_t *cpus_ptr;
|
||||
cpumask_t cpus_mask;
|
||||
void *migration_pending;
|
||||
#ifdef CONFIG_SMP
|
||||
unsigned short migration_disabled;
|
||||
#endif
|
||||
unsigned short migration_flags;
|
||||
|
||||
#ifdef CONFIG_PREEMPT_RCU
|
||||
int rcu_read_lock_nesting;
|
||||
|
@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
|
||||
extern int sched_cpu_deactivate(unsigned int cpu);
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
extern int sched_cpu_wait_empty(unsigned int cpu);
|
||||
extern int sched_cpu_dying(unsigned int cpu);
|
||||
#else
|
||||
# define sched_cpu_wait_empty NULL
|
||||
# define sched_cpu_dying NULL
|
||||
#endif
|
||||
|
||||
|
@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
|
||||
|
||||
extern void membarrier_exec_mmap(struct mm_struct *mm);
|
||||
|
||||
extern void membarrier_update_current_mm(struct mm_struct *next_mm);
|
||||
|
||||
#else
|
||||
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
|
||||
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
|
||||
@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
|
||||
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
|
||||
{
|
||||
}
|
||||
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_SCHED_MM_H */
|
||||
|
@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
|
||||
|
||||
#endif /* !CONFIG_SMP */
|
||||
|
||||
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
|
||||
extern void rebuild_sched_domains_energy(void);
|
||||
#else
|
||||
static inline void rebuild_sched_domains_energy(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_scale_cpu_capacity
|
||||
/**
|
||||
* arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.
|
||||
|
@@ -21,24 +21,23 @@ typedef bool (*smp_cond_func_t)(int cpu, void *info);
 * structure shares (partial) layout with struct irq_work
 */
struct __call_single_data {
	union {
		struct __call_single_node node;
		struct {
			struct llist_node llist;
			unsigned int flags;
#ifdef CONFIG_64BIT
			u16 src, dst;
#endif
		};
	};
	smp_call_func_t func;
	void *info;
};

#define CSD_INIT(_func, _info) \
	(struct __call_single_data){ .func = (_func), .info = (_info), }

/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
typedef struct __call_single_data call_single_data_t
	__aligned(sizeof(struct __call_single_data));

#define INIT_CSD(_csd, _func, _info)		\
do {						\
	*(_csd) = CSD_INIT((_func), (_info));	\
} while (0)

/*
 * Enqueue a llist_node on the call_single_queue; be very careful, read
 * flush_smp_call_function_queue() in detail.
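A hedged sketch of how a caller would use INIT_CSD() together with smp_call_function_single_async(), in the style of the conversions in this merge. The request structure and function names are invented for illustration.

#include <linux/smp.h>
#include <linux/printk.h>

/* Hypothetical request carrying its own csd, as the converted call sites do. */
struct my_req {
	call_single_data_t csd;
	int payload;
};

static void my_remote_fn(void *info)
{
	struct my_req *req = info;

	pr_info("handling payload %d on CPU%d\n", req->payload, smp_processor_id());
}

static int kick_remote(struct my_req *req, int cpu)
{
	/* One-shot initialization replaces the open-coded func/info/flags stores. */
	INIT_CSD(&req->csd, my_remote_fn, req);

	/* Returns -EBUSY if this csd is still in flight from an earlier call. */
	return smp_call_function_single_async(cpu, &req->csd);
}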
@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
|
||||
struct cpu_stop_work {
|
||||
struct list_head list; /* cpu_stopper->works */
|
||||
cpu_stop_fn_t fn;
|
||||
unsigned long caller;
|
||||
void *arg;
|
||||
struct cpu_stop_done *done;
|
||||
};
|
||||
@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
|
||||
void stop_machine_unpark(int cpu);
|
||||
void stop_machine_yield(const struct cpumask *cpumask);
|
||||
|
||||
extern void print_stop_info(const char *log_lvl, struct task_struct *task);
|
||||
|
||||
#else /* CONFIG_SMP */
|
||||
|
||||
#include <linux/workqueue.h>
|
||||
@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
/*
|
||||
|
@ -96,6 +96,8 @@ struct sched_param {
|
||||
* on a CPU with a capacity big enough to fit the specified value.
|
||||
* A task with a max utilization value smaller than 1024 is more likely
|
||||
* scheduled on a CPU with no more capacity than the specified value.
|
||||
*
|
||||
* A task utilization boundary can be reset by setting the attribute to -1.
|
||||
*/
|
||||
struct sched_attr {
|
||||
__u32 size;
|
||||
|
@ -298,7 +298,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
|
||||
if (irqs_disabled()) {
|
||||
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
|
||||
work = this_cpu_ptr(&up_read_work);
|
||||
if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
|
||||
if (irq_work_is_busy(&work->irq_work)) {
|
||||
/* cannot queue more up_read, fallback */
|
||||
irq_work_busy = true;
|
||||
}
|
||||
|
@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
||||
*/
|
||||
static void rebuild_sched_domains_locked(void)
|
||||
{
|
||||
struct cgroup_subsys_state *pos_css;
|
||||
struct sched_domain_attr *attr;
|
||||
cpumask_var_t *doms;
|
||||
struct cpuset *cs;
|
||||
int ndoms;
|
||||
|
||||
lockdep_assert_cpus_held();
|
||||
percpu_rwsem_assert_held(&cpuset_rwsem);
|
||||
|
||||
/*
|
||||
* We have raced with CPU hotplug. Don't do anything to avoid
|
||||
* If we have raced with CPU hotplug, return early to avoid
|
||||
* passing doms with offlined cpu to partition_sched_domains().
|
||||
* Anyways, hotplug work item will rebuild sched domains.
|
||||
* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
|
||||
*
|
||||
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
|
||||
* should be the same as the active CPUs, so checking only top_cpuset
|
||||
* is enough to detect racing CPU offlines.
|
||||
*/
|
||||
if (!top_cpuset.nr_subparts_cpus &&
|
||||
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
|
||||
return;
|
||||
|
||||
if (top_cpuset.nr_subparts_cpus &&
|
||||
!cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
|
||||
/*
|
||||
* With subpartition CPUs, however, the effective CPUs of a partition
|
||||
* root should be only a subset of the active CPUs. Since a CPU in any
|
||||
* partition root could be offlined, all must be checked.
|
||||
*/
|
||||
if (top_cpuset.nr_subparts_cpus) {
|
||||
rcu_read_lock();
|
||||
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
|
||||
if (!is_partition_root(cs)) {
|
||||
pos_css = css_rightmost_descendant(pos_css);
|
||||
continue;
|
||||
}
|
||||
if (!cpumask_subset(cs->effective_cpus,
|
||||
cpu_active_mask)) {
|
||||
rcu_read_unlock();
|
||||
return;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/* Generate domain masks and attrs */
|
||||
ndoms = generate_sched_domains(&doms, &attr);
|
||||
|
@ -1606,7 +1606,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
|
||||
.name = "ap:online",
|
||||
},
|
||||
/*
|
||||
* Handled on controll processor until the plugged processor manages
|
||||
* Handled on control processor until the plugged processor manages
|
||||
* this itself.
|
||||
*/
|
||||
[CPUHP_TEARDOWN_CPU] = {
|
||||
@ -1615,6 +1615,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
|
||||
.teardown.single = takedown_cpu,
|
||||
.cant_stop = true,
|
||||
},
|
||||
|
||||
[CPUHP_AP_SCHED_WAIT_EMPTY] = {
|
||||
.name = "sched:waitempty",
|
||||
.startup.single = NULL,
|
||||
.teardown.single = sched_cpu_wait_empty,
|
||||
},
|
||||
|
||||
/* Handle smpboot threads park/unpark */
|
||||
[CPUHP_AP_SMPBOOT_THREADS] = {
|
||||
.name = "smpboot/threads:online",
|
||||
|
@ -225,8 +225,6 @@ NOKPROBE_SYMBOL(kgdb_skipexception);
|
||||
* Default (weak) implementation for kgdb_roundup_cpus
|
||||
*/
|
||||
|
||||
static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
|
||||
|
||||
void __weak kgdb_call_nmi_hook(void *ignored)
|
||||
{
|
||||
/*
|
||||
@ -241,6 +239,9 @@ void __weak kgdb_call_nmi_hook(void *ignored)
|
||||
}
|
||||
NOKPROBE_SYMBOL(kgdb_call_nmi_hook);
|
||||
|
||||
static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) =
|
||||
CSD_INIT(kgdb_call_nmi_hook, NULL);
|
||||
|
||||
void __weak kgdb_roundup_cpus(void)
|
||||
{
|
||||
call_single_data_t *csd;
|
||||
@ -267,7 +268,6 @@ void __weak kgdb_roundup_cpus(void)
|
||||
continue;
|
||||
kgdb_info[cpu].rounding_up = true;
|
||||
|
||||
csd->func = kgdb_call_nmi_hook;
|
||||
ret = smp_call_function_single_async(cpu, csd);
|
||||
if (ret)
|
||||
kgdb_info[cpu].rounding_up = false;
|
||||
|
@ -478,10 +478,24 @@ static void exit_mm(void)
|
||||
BUG_ON(mm != current->active_mm);
|
||||
/* more a memory barrier than a real lock */
|
||||
task_lock(current);
|
||||
/*
|
||||
* When a thread stops operating on an address space, the loop
|
||||
* in membarrier_private_expedited() may not observe that
|
||||
* tsk->mm, and the loop in membarrier_global_expedited() may
|
||||
* not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
|
||||
* rq->membarrier_state, so those would not issue an IPI.
|
||||
* Membarrier requires a memory barrier after accessing
|
||||
* user-space memory, before clearing tsk->mm or the
|
||||
* rq->membarrier_state.
|
||||
*/
|
||||
smp_mb__after_spinlock();
|
||||
local_irq_disable();
|
||||
current->mm = NULL;
|
||||
mmap_read_unlock(mm);
|
||||
membarrier_update_current_mm(NULL);
|
||||
enter_lazy_tlb(mm, current);
|
||||
local_irq_enable();
|
||||
task_unlock(current);
|
||||
mmap_read_unlock(mm);
|
||||
mm_update_next_owner(mm);
|
||||
mmput(mm);
|
||||
if (test_thread_flag(TIF_MEMDIE))
|
||||
|
@ -31,10 +31,10 @@ static bool irq_work_claim(struct irq_work *work)
|
||||
{
|
||||
int oflags;
|
||||
|
||||
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
|
||||
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
|
||||
/*
|
||||
* If the work is already pending, no need to raise the IPI.
|
||||
* The pairing atomic_fetch_andnot() in irq_work_run() makes sure
|
||||
* The pairing smp_mb() in irq_work_single() makes sure
|
||||
* everything we did before is visible.
|
||||
*/
|
||||
if (oflags & IRQ_WORK_PENDING)
|
||||
@ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void)
|
||||
static void __irq_work_queue_local(struct irq_work *work)
|
||||
{
|
||||
/* If the work is "lazy", handle it from next tick if any */
|
||||
if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
|
||||
if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
|
||||
if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
|
||||
if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
|
||||
tick_nohz_tick_stopped())
|
||||
arch_irq_work_raise();
|
||||
} else {
|
||||
if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
|
||||
if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
|
||||
arch_irq_work_raise();
|
||||
}
|
||||
}
|
||||
@ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
|
||||
if (cpu != smp_processor_id()) {
|
||||
/* Arch remote IPI send/receive backend aren't NMI safe */
|
||||
WARN_ON_ONCE(in_nmi());
|
||||
__smp_call_single_queue(cpu, &work->llnode);
|
||||
__smp_call_single_queue(cpu, &work->node.llist);
|
||||
} else {
|
||||
__irq_work_queue_local(work);
|
||||
}
|
||||
@ -136,23 +136,28 @@ void irq_work_single(void *arg)
|
||||
int flags;
|
||||
|
||||
/*
|
||||
* Clear the PENDING bit, after this point the @work
|
||||
* can be re-used.
|
||||
* Make it immediately visible so that other CPUs trying
|
||||
* to claim that work don't rely on us to handle their data
|
||||
* while we are in the middle of the func.
|
||||
*/
|
||||
flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
|
||||
|
||||
lockdep_irq_work_enter(work);
|
||||
work->func(work);
|
||||
lockdep_irq_work_exit(work);
|
||||
/*
|
||||
* Clear the BUSY bit and return to the free state if
|
||||
* no-one else claimed it meanwhile.
|
||||
* Clear the PENDING bit, after this point the @work can be re-used.
|
||||
* The PENDING bit acts as a lock, and we own it, so we can clear it
|
||||
* without atomic ops.
|
||||
*/
|
||||
flags = atomic_read(&work->node.a_flags);
|
||||
flags &= ~IRQ_WORK_PENDING;
|
||||
(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
|
||||
atomic_set(&work->node.a_flags, flags);
|
||||
|
||||
/*
|
||||
* See irq_work_claim().
|
||||
*/
|
||||
smp_mb();
|
||||
|
||||
lockdep_irq_work_enter(flags);
|
||||
work->func(work);
|
||||
lockdep_irq_work_exit(flags);
|
||||
|
||||
/*
|
||||
* Clear the BUSY bit, if set, and return to the free state if no-one
|
||||
* else claimed it meanwhile.
|
||||
*/
|
||||
(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
|
||||
}
|
||||
|
||||
static void irq_work_run_list(struct llist_head *list)
|
||||
@ -166,7 +171,7 @@ static void irq_work_run_list(struct llist_head *list)
|
||||
return;
|
||||
|
||||
llnode = llist_del_all(list);
|
||||
llist_for_each_entry_safe(work, tmp, llnode, llnode)
|
||||
llist_for_each_entry_safe(work, tmp, llnode, node.llist)
|
||||
irq_work_single(work);
|
||||
}
|
||||
|
||||
@ -198,7 +203,7 @@ void irq_work_sync(struct irq_work *work)
|
||||
{
|
||||
lockdep_assert_irqs_enabled();
|
||||
|
||||
while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
|
||||
while (irq_work_is_busy(work))
|
||||
cpu_relax();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irq_work_sync);
|
||||
|
@ -1249,6 +1249,7 @@ void kthread_use_mm(struct mm_struct *mm)
|
||||
tsk->active_mm = mm;
|
||||
}
|
||||
tsk->mm = mm;
|
||||
membarrier_update_current_mm(mm);
|
||||
switch_mm_irqs_off(active_mm, mm, tsk);
|
||||
local_irq_enable();
|
||||
task_unlock(tsk);
|
||||
@ -1256,8 +1257,19 @@ void kthread_use_mm(struct mm_struct *mm)
|
||||
finish_arch_post_lock_switch();
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When a kthread starts operating on an address space, the loop
|
||||
* in membarrier_{private,global}_expedited() may not observe
|
||||
* that tsk->mm, and not issue an IPI. Membarrier requires a
|
||||
* memory barrier after storing to tsk->mm, before accessing
|
||||
* user-space memory. A full memory barrier for membarrier
|
||||
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
|
||||
* mmdrop(), or explicitly with smp_mb().
|
||||
*/
|
||||
if (active_mm != mm)
|
||||
mmdrop(active_mm);
|
||||
else
|
||||
smp_mb();
|
||||
|
||||
to_kthread(tsk)->oldfs = force_uaccess_begin();
|
||||
}
|
||||
@ -1277,9 +1289,18 @@ void kthread_unuse_mm(struct mm_struct *mm)
|
||||
force_uaccess_end(to_kthread(tsk)->oldfs);
|
||||
|
||||
task_lock(tsk);
|
||||
/*
|
||||
* When a kthread stops operating on an address space, the loop
|
||||
* in membarrier_{private,global}_expedited() may not observe
|
||||
* that tsk->mm, and not issue an IPI. Membarrier requires a
|
||||
* memory barrier after accessing user-space memory, before
|
||||
* clearing tsk->mm.
|
||||
*/
|
||||
smp_mb__after_spinlock();
|
||||
sync_mm_rss(mm);
|
||||
local_irq_disable();
|
||||
tsk->mm = NULL;
|
||||
membarrier_update_current_mm(NULL);
|
||||
/* active_mm is still 'mm' */
|
||||
enter_lazy_tlb(mm, tsk);
|
||||
local_irq_enable();
|
||||
|
@ -3025,10 +3025,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
|
||||
wake_up_interruptible(&log_wait);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
|
||||
.func = wake_up_klogd_work_func,
|
||||
.flags = ATOMIC_INIT(IRQ_WORK_LAZY),
|
||||
};
|
||||
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
|
||||
IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
|
||||
|
||||
void wake_up_klogd(void)
|
||||
{
|
||||
|
@ -1322,8 +1322,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
|
||||
if (IS_ENABLED(CONFIG_IRQ_WORK) &&
|
||||
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
|
||||
(rnp->ffmask & rdp->grpmask)) {
|
||||
init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
|
||||
atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ);
|
||||
rdp->rcu_iw_pending = true;
|
||||
rdp->rcu_iw_gp_seq = rnp->gp_seq;
|
||||
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
|
||||
@ -4023,6 +4021,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
|
||||
rdp->cpu_no_qs.b.norm = true;
|
||||
rdp->core_needs_qs = false;
|
||||
rdp->rcu_iw_pending = false;
|
||||
rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
|
||||
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
|
||||
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
|
||||
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
|
||||
|
kernel/sched/core.c: 1104 changed lines (file diff suppressed because it is too large).
@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
|
||||
const struct sched_dl_entity *dl_se = &p->dl;
|
||||
|
||||
if (later_mask &&
|
||||
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
|
||||
cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
|
||||
unsigned long cap, max_cap = 0;
|
||||
int cpu, max_cpu = -1;
|
||||
|
||||
@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
|
||||
|
||||
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
|
||||
|
||||
if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
|
||||
if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
|
||||
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
|
||||
if (later_mask)
|
||||
cpumask_set_cpu(best_cpu, later_mask);
|
||||
|
@ -899,16 +899,9 @@ struct cpufreq_governor *cpufreq_default_governor(void)
|
||||
cpufreq_governor_init(schedutil_gov);
|
||||
|
||||
#ifdef CONFIG_ENERGY_MODEL
|
||||
extern bool sched_energy_update;
|
||||
extern struct mutex sched_energy_mutex;
|
||||
|
||||
static void rebuild_sd_workfn(struct work_struct *work)
|
||||
{
|
||||
mutex_lock(&sched_energy_mutex);
|
||||
sched_energy_update = true;
|
||||
rebuild_sched_domains();
|
||||
sched_energy_update = false;
|
||||
mutex_unlock(&sched_energy_mutex);
|
||||
rebuild_sched_domains_energy();
|
||||
}
|
||||
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
* This code tracks the priority of each CPU so that global migration
|
||||
* decisions are easy to calculate. Each CPU can be in a state as follows:
|
||||
*
|
||||
* (INVALID), IDLE, NORMAL, RT1, ... RT99
|
||||
* (INVALID), NORMAL, RT1, ... RT99, HIGHER
|
||||
*
|
||||
* going from the lowest priority to the highest. CPUs in the INVALID state
|
||||
* are not eligible for routing. The system maintains this state with
|
||||
@ -19,24 +19,48 @@
|
||||
* in that class). Therefore a typical application without affinity
|
||||
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
|
||||
* searches). For tasks with affinity restrictions, the algorithm has a
|
||||
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
|
||||
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
|
||||
* yields the worst case search is fairly contrived.
|
||||
*/
|
||||
#include "sched.h"
|
||||
|
||||
/* Convert between a 140 based task->prio, and our 102 based cpupri */
|
||||
/*
|
||||
* p->rt_priority p->prio newpri cpupri
|
||||
*
|
||||
* -1 -1 (CPUPRI_INVALID)
|
||||
*
|
||||
* 99 0 (CPUPRI_NORMAL)
|
||||
*
|
||||
* 1 98 98 1
|
||||
* ...
|
||||
* 49 50 50 49
|
||||
* 50 49 49 50
|
||||
* ...
|
||||
* 99 0 0 99
|
||||
*
|
||||
* 100 100 (CPUPRI_HIGHER)
|
||||
*/
|
||||
static int convert_prio(int prio)
|
||||
{
|
||||
int cpupri;
|
||||
|
||||
if (prio == CPUPRI_INVALID)
|
||||
cpupri = CPUPRI_INVALID;
|
||||
else if (prio == MAX_PRIO)
|
||||
cpupri = CPUPRI_IDLE;
|
||||
else if (prio >= MAX_RT_PRIO)
|
||||
cpupri = CPUPRI_NORMAL;
|
||||
else
|
||||
cpupri = MAX_RT_PRIO - prio + 1;
|
||||
switch (prio) {
|
||||
case CPUPRI_INVALID:
|
||||
cpupri = CPUPRI_INVALID; /* -1 */
|
||||
break;
|
||||
|
||||
case 0 ... 98:
|
||||
cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */
|
||||
break;
|
||||
|
||||
case MAX_RT_PRIO-1:
|
||||
cpupri = CPUPRI_NORMAL; /* 0 */
|
||||
break;
|
||||
|
||||
case MAX_RT_PRIO:
|
||||
cpupri = CPUPRI_HIGHER; /* 100 */
|
||||
break;
|
||||
}
|
||||
|
||||
return cpupri;
|
||||
}
|
||||
@ -73,11 +97,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
|
||||
if (skip)
|
||||
return 0;
|
||||
|
||||
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
|
||||
if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
|
||||
return 0;
|
||||
|
||||
if (lowest_mask) {
|
||||
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
|
||||
cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
|
||||
|
||||
/*
|
||||
* We have to ensure that we have at least one bit
|
||||
@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
|
||||
* cpupri_set - update the CPU priority setting
|
||||
* @cp: The cpupri context
|
||||
* @cpu: The target CPU
|
||||
* @newpri: The priority (INVALID-RT99) to assign to this CPU
|
||||
* @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
|
||||
*
|
||||
* Note: Assumes cpu_rq(cpu)->lock is locked
|
||||
*
|
||||
|
@@ -1,11 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */

#define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)
#define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO+1)

#define CPUPRI_INVALID		-1
#define CPUPRI_IDLE		0
#define CPUPRI_NORMAL		1
/* values 2-101 are RT priorities 0-99 */
#define CPUPRI_NORMAL		0
/* values 1-99 are for RT1-RT99 priorities */
#define CPUPRI_HIGHER		100

struct cpupri_vec {
	atomic_t		count;
@ -119,6 +119,17 @@ static inline unsigned long dl_bw_capacity(int i)
|
||||
return __dl_bw_capacity(i);
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool dl_bw_visited(int cpu, u64 gen)
|
||||
{
|
||||
struct root_domain *rd = cpu_rq(cpu)->rd;
|
||||
|
||||
if (rd->visit_gen == gen)
|
||||
return true;
|
||||
|
||||
rd->visit_gen = gen;
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
static inline struct dl_bw *dl_bw_of(int i)
|
||||
{
|
||||
@ -134,6 +145,11 @@ static inline unsigned long dl_bw_capacity(int i)
|
||||
{
|
||||
return SCHED_CAPACITY_SCALE;
|
||||
}
|
||||
|
||||
static inline bool dl_bw_visited(int cpu, u64 gen)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline
|
||||
@ -565,7 +581,7 @@ static int push_dl_task(struct rq *rq);
|
||||
|
||||
static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
return dl_task(prev);
|
||||
return rq->online && dl_task(prev);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct callback_head, dl_push_head);
|
||||
@ -1397,6 +1413,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
|
||||
|
||||
if (dl_rq->earliest_dl.curr == 0 ||
|
||||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
|
||||
if (dl_rq->earliest_dl.curr == 0)
|
||||
cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
|
||||
dl_rq->earliest_dl.curr = deadline;
|
||||
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
|
||||
}
|
||||
@ -1414,6 +1432,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
|
||||
dl_rq->earliest_dl.curr = 0;
|
||||
dl_rq->earliest_dl.next = 0;
|
||||
cpudl_clear(&rq->rd->cpudl, rq->cpu);
|
||||
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
|
||||
} else {
|
||||
struct rb_node *leftmost = dl_rq->root.rb_leftmost;
|
||||
struct sched_dl_entity *entry;
|
||||
@ -1670,13 +1689,13 @@ static void yield_task_dl(struct rq *rq)
|
||||
static int find_later_rq(struct task_struct *task);
|
||||
|
||||
static int
|
||||
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
select_task_rq_dl(struct task_struct *p, int cpu, int flags)
|
||||
{
|
||||
struct task_struct *curr;
|
||||
bool select_rq;
|
||||
struct rq *rq;
|
||||
|
||||
if (sd_flag != SD_BALANCE_WAKE)
|
||||
if (!(flags & WF_TTWU))
|
||||
goto out;
|
||||
|
||||
rq = cpu_rq(cpu);
|
||||
@ -1918,7 +1937,7 @@ static void task_fork_dl(struct task_struct *p)
|
||||
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
|
||||
{
|
||||
if (!task_running(rq, p) &&
|
||||
cpumask_test_cpu(cpu, p->cpus_ptr))
|
||||
cpumask_test_cpu(cpu, &p->cpus_mask))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
@ -2008,7 +2027,7 @@ static int find_later_rq(struct task_struct *task)
|
||||
return this_cpu;
|
||||
}
|
||||
|
||||
best_cpu = cpumask_first_and(later_mask,
|
||||
best_cpu = cpumask_any_and_distribute(later_mask,
|
||||
sched_domain_span(sd));
|
||||
/*
|
||||
* Last chance: if a CPU being in both later_mask
|
||||
@ -2031,7 +2050,7 @@ static int find_later_rq(struct task_struct *task)
|
||||
if (this_cpu != -1)
|
||||
return this_cpu;
|
||||
|
||||
cpu = cpumask_any(later_mask);
|
||||
cpu = cpumask_any_distribute(later_mask);
|
||||
if (cpu < nr_cpu_ids)
|
||||
return cpu;
|
||||
|
||||
@ -2068,7 +2087,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
|
||||
/* Retry if something changed. */
|
||||
if (double_lock_balance(rq, later_rq)) {
|
||||
if (unlikely(task_rq(task) != rq ||
|
||||
!cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
|
||||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
|
||||
task_running(rq, task) ||
|
||||
!dl_task(task) ||
|
||||
!task_on_rq_queued(task))) {
|
||||
@ -2135,6 +2154,9 @@ static int push_dl_task(struct rq *rq)
|
||||
return 0;
|
||||
|
||||
retry:
|
||||
if (is_migration_disabled(next_task))
|
||||
return 0;
|
||||
|
||||
if (WARN_ON(next_task == rq->curr))
|
||||
return 0;
|
||||
|
||||
@ -2212,7 +2234,7 @@ static void push_dl_tasks(struct rq *rq)
|
||||
static void pull_dl_task(struct rq *this_rq)
|
||||
{
|
||||
int this_cpu = this_rq->cpu, cpu;
|
||||
struct task_struct *p;
|
||||
struct task_struct *p, *push_task;
|
||||
bool resched = false;
|
||||
struct rq *src_rq;
|
||||
u64 dmin = LONG_MAX;
|
||||
@ -2242,6 +2264,7 @@ static void pull_dl_task(struct rq *this_rq)
|
||||
continue;
|
||||
|
||||
/* Might drop this_rq->lock */
|
||||
push_task = NULL;
|
||||
double_lock_balance(this_rq, src_rq);
|
||||
|
||||
/*
|
||||
@ -2273,17 +2296,27 @@ static void pull_dl_task(struct rq *this_rq)
|
||||
src_rq->curr->dl.deadline))
|
||||
goto skip;
|
||||
|
||||
resched = true;
|
||||
|
||||
if (is_migration_disabled(p)) {
|
||||
push_task = get_push_task(src_rq);
|
||||
} else {
|
||||
deactivate_task(src_rq, p, 0);
|
||||
set_task_cpu(p, this_cpu);
|
||||
activate_task(this_rq, p, 0);
|
||||
dmin = p->dl.deadline;
|
||||
resched = true;
|
||||
}
|
||||
|
||||
/* Is there any other task even earlier? */
|
||||
}
|
||||
skip:
|
||||
double_unlock_balance(this_rq, src_rq);
|
||||
|
||||
if (push_task) {
|
||||
raw_spin_unlock(&this_rq->lock);
|
||||
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
|
||||
push_task, &src_rq->push_work);
|
||||
raw_spin_lock(&this_rq->lock);
|
||||
}
|
||||
}
|
||||
|
||||
if (resched)
|
||||
@ -2307,7 +2340,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
|
||||
}
|
||||
|
||||
static void set_cpus_allowed_dl(struct task_struct *p,
|
||||
const struct cpumask *new_mask)
|
||||
const struct cpumask *new_mask,
|
||||
u32 flags)
|
||||
{
|
||||
struct root_domain *src_rd;
|
||||
struct rq *rq;
|
||||
@ -2336,7 +2370,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
|
||||
raw_spin_unlock(&src_dl_b->lock);
|
||||
}
|
||||
|
||||
set_cpus_allowed_common(p, new_mask);
|
||||
set_cpus_allowed_common(p, new_mask, flags);
|
||||
}
|
||||
|
||||
/* Assumes rq->lock is held */
|
||||
@ -2509,8 +2543,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
|
||||
}
|
||||
}
|
||||
|
||||
const struct sched_class dl_sched_class
|
||||
__section("__dl_sched_class") = {
|
||||
DEFINE_SCHED_CLASS(dl) = {
|
||||
|
||||
.enqueue_task = enqueue_task_dl,
|
||||
.dequeue_task = dequeue_task_dl,
|
||||
.yield_task = yield_task_dl,
|
||||
@ -2529,6 +2563,7 @@ const struct sched_class dl_sched_class
|
||||
.rq_online = rq_online_dl,
|
||||
.rq_offline = rq_offline_dl,
|
||||
.task_woken = task_woken_dl,
|
||||
.find_lock_rq = find_lock_later_rq,
|
||||
#endif
|
||||
|
||||
.task_tick = task_tick_dl,
|
||||
@ -2541,33 +2576,39 @@ const struct sched_class dl_sched_class
|
||||
.update_curr = update_curr_dl,
|
||||
};
|
||||
|
||||
/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
|
||||
static u64 dl_generation;
|
||||
|
||||
int sched_dl_global_validate(void)
|
||||
{
|
||||
u64 runtime = global_rt_runtime();
|
||||
u64 period = global_rt_period();
|
||||
u64 new_bw = to_ratio(period, runtime);
|
||||
u64 gen = ++dl_generation;
|
||||
struct dl_bw *dl_b;
|
||||
int cpu, ret = 0;
|
||||
int cpu, cpus, ret = 0;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* Here we want to check the bandwidth not being set to some
|
||||
* value smaller than the currently allocated bandwidth in
|
||||
* any of the root_domains.
|
||||
*
|
||||
* FIXME: Cycling on all the CPUs is overdoing, but simpler than
|
||||
* cycling on root_domains... Discussion on different/better
|
||||
* solutions is welcome!
|
||||
*/
|
||||
for_each_possible_cpu(cpu) {
|
||||
rcu_read_lock_sched();
|
||||
|
||||
if (dl_bw_visited(cpu, gen))
|
||||
goto next;
|
||||
|
||||
dl_b = dl_bw_of(cpu);
|
||||
cpus = dl_bw_cpus(cpu);
|
||||
|
||||
raw_spin_lock_irqsave(&dl_b->lock, flags);
|
||||
if (new_bw < dl_b->total_bw)
|
||||
if (new_bw * cpus < dl_b->total_bw)
|
||||
ret = -EBUSY;
|
||||
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
|
||||
|
||||
next:
|
||||
rcu_read_unlock_sched();
|
||||
|
||||
if (ret)
|
||||
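A note on the arithmetic in the hunk above: to_ratio() turns runtime/period into a fixed-point fraction of one CPU, and since accounting is per root domain the per-CPU limit new_bw has to be scaled by the number of CPUs the root domain spans before it is compared with total_bw. A minimal standalone sketch of that check (names and constants are simplified here, not lifted from the tree):

	#define BW_SHIFT	20	/* fixed point: 1.0 == 1 << 20 */

	/* fraction of one CPU that runtime/period represents */
	static u64 to_ratio_sketch(u64 period, u64 runtime)
	{
		return (runtime << BW_SHIFT) / period;
	}

	/* e.g. runtime = 950000us, period = 1000000us -> ~0.95 * (1 << 20) per CPU */
	static bool dl_admission_ok(u64 new_bw, u64 total_bw, int cpus)
	{
		/* total_bw aggregates every deadline task in the root domain */
		return (u64)cpus * new_bw >= total_bw;
	}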
@ -2593,6 +2634,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
|
||||
void sched_dl_do_global(void)
|
||||
{
|
||||
u64 new_bw = -1;
|
||||
u64 gen = ++dl_generation;
|
||||
struct dl_bw *dl_b;
|
||||
int cpu;
|
||||
unsigned long flags;
|
||||
@ -2603,11 +2645,14 @@ void sched_dl_do_global(void)
|
||||
if (global_rt_runtime() != RUNTIME_INF)
|
||||
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
|
||||
|
||||
/*
|
||||
* FIXME: As above...
|
||||
*/
|
||||
for_each_possible_cpu(cpu) {
|
||||
rcu_read_lock_sched();
|
||||
|
||||
if (dl_bw_visited(cpu, gen)) {
|
||||
rcu_read_unlock_sched();
|
||||
continue;
|
||||
}
|
||||
|
||||
dl_b = dl_bw_of(cpu);
|
||||
|
||||
raw_spin_lock_irqsave(&dl_b->lock, flags);
|
||||
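The dl_bw_visited()/visit_gen pairing used by both loops above is a plain generation counter: each sysctl pass bumps dl_generation once, and every root domain records the last generation in which it was processed, so CPUs sharing a root domain are only handled once per pass. A sketch of the idiom with illustrative names (the in-tree helper operates on a CPU's root domain rather than a bare struct):

	struct rd_like {
		u64 visit_gen;		/* last pass this domain was seen in */
	};

	/* Return true if @rd was already handled during pass @gen. */
	static bool rd_visited(struct rd_like *rd, u64 gen)
	{
		if (rd->visit_gen == gen)
			return true;

		rd->visit_gen = gen;
		return false;
	}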
|
@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
if (!schedstat_enabled())
|
||||
return;
|
||||
|
||||
/*
|
||||
* When sched_schedstat changes from 0 to 1, some sched entities
* may already be on the runqueue; their se->statistics.wait_start
* will be 0, which would make the delta wrong. We need to avoid this
|
||||
* scenario.
|
||||
*/
|
||||
if (unlikely(!schedstat_val(se->statistics.wait_start)))
|
||||
return;
|
||||
|
||||
delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
|
||||
|
||||
if (entity_is_task(se)) {
|
||||
@ -1550,7 +1559,8 @@ struct task_numa_env {
|
||||
static unsigned long cpu_load(struct rq *rq);
|
||||
static unsigned long cpu_runnable(struct rq *rq);
|
||||
static unsigned long cpu_util(int cpu);
|
||||
static inline long adjust_numa_imbalance(int imbalance, int nr_running);
|
||||
static inline long adjust_numa_imbalance(int imbalance,
|
||||
int dst_running, int dst_weight);
|
||||
|
||||
static inline enum
|
||||
numa_type numa_classify(unsigned int imbalance_pct,
|
||||
@ -1930,7 +1940,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
|
||||
src_running = env->src_stats.nr_running - 1;
|
||||
dst_running = env->dst_stats.nr_running + 1;
|
||||
imbalance = max(0, dst_running - src_running);
|
||||
imbalance = adjust_numa_imbalance(imbalance, dst_running);
|
||||
imbalance = adjust_numa_imbalance(imbalance, dst_running,
|
||||
env->dst_stats.weight);
|
||||
|
||||
/* Use idle CPU if there is no imbalance */
|
||||
if (!imbalance) {
|
||||
@ -4779,25 +4790,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
|
||||
/* throttled entity or throttle-on-deactivate */
|
||||
if (!se->on_rq)
|
||||
break;
|
||||
goto done;
|
||||
|
||||
if (dequeue) {
|
||||
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
|
||||
} else {
|
||||
update_load_avg(qcfs_rq, se, 0);
|
||||
se_update_runnable(se);
|
||||
}
|
||||
|
||||
qcfs_rq->h_nr_running -= task_delta;
|
||||
qcfs_rq->idle_h_nr_running -= idle_task_delta;
|
||||
|
||||
if (qcfs_rq->load.weight)
|
||||
dequeue = 0;
|
||||
if (qcfs_rq->load.weight) {
|
||||
/* Avoid re-evaluating load for this entity: */
|
||||
se = parent_entity(se);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!se)
|
||||
for_each_sched_entity(se) {
|
||||
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
|
||||
/* throttled entity or throttle-on-deactivate */
|
||||
if (!se->on_rq)
|
||||
goto done;
|
||||
|
||||
update_load_avg(qcfs_rq, se, 0);
|
||||
se_update_runnable(se);
|
||||
|
||||
qcfs_rq->h_nr_running -= task_delta;
|
||||
qcfs_rq->idle_h_nr_running -= idle_task_delta;
|
||||
}
|
||||
|
||||
/* At this point se is NULL and we are at root level */
|
||||
sub_nr_running(rq, task_delta);
|
||||
|
||||
done:
|
||||
/*
|
||||
* Note: distribution will already see us throttled via the
|
||||
* throttled-list. rq->lock protects completion.
|
||||
@ -5105,9 +5128,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
||||
return;
|
||||
|
||||
distribute_cfs_runtime(cfs_b);
|
||||
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -5805,6 +5825,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
|
||||
if (sync && cpu_rq(this_cpu)->nr_running == 1)
|
||||
return this_cpu;
|
||||
|
||||
if (available_idle_cpu(prev_cpu))
|
||||
return prev_cpu;
|
||||
|
||||
return nr_cpumask_bits;
|
||||
}
|
||||
|
||||
@ -6063,10 +6086,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
|
||||
break;
|
||||
}
|
||||
}
|
||||
cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
|
||||
|
||||
if (idle)
|
||||
return core;
|
||||
|
||||
cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -6307,7 +6331,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
||||
}
|
||||
|
||||
/**
|
||||
* Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
|
||||
* cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
|
||||
* @cpu: the CPU to get the utilization of
|
||||
*
|
||||
* The unit of the return value must be the one of capacity so we can compare
|
||||
@ -6683,7 +6707,7 @@ fail:
|
||||
|
||||
/*
|
||||
* select_task_rq_fair: Select target runqueue for the waking task in domains
|
||||
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
|
||||
* that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
|
||||
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
|
||||
*
|
||||
* Balances load by selecting the idlest CPU in the idlest group, or under
|
||||
@ -6694,15 +6718,17 @@ fail:
|
||||
* preempt must be disabled.
|
||||
*/
|
||||
static int
|
||||
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
|
||||
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
|
||||
{
|
||||
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
|
||||
struct sched_domain *tmp, *sd = NULL;
|
||||
int cpu = smp_processor_id();
|
||||
int new_cpu = prev_cpu;
|
||||
int want_affine = 0;
|
||||
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
|
||||
/* SD_flags and WF_flags share the first nibble */
|
||||
int sd_flag = wake_flags & 0xF;
|
||||
|
||||
if (sd_flag & SD_BALANCE_WAKE) {
|
||||
if (wake_flags & WF_TTWU) {
|
||||
record_wakee(p);
|
||||
|
||||
if (sched_energy_enabled()) {
|
||||
@ -6739,9 +6765,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
||||
if (unlikely(sd)) {
|
||||
/* Slow path */
|
||||
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
||||
} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
|
||||
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
|
||||
/* Fast path */
|
||||
|
||||
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
||||
|
||||
if (want_affine)
|
||||
@ -8757,6 +8782,16 @@ static bool update_pick_idlest(struct sched_group *idlest,
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allow a NUMA imbalance if the busy CPU count is less than 25% of the domain.
|
||||
* This is an approximation as the number of running tasks may not be
|
||||
* related to the number of busy CPUs due to sched_setaffinity.
|
||||
*/
|
||||
static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
|
||||
{
|
||||
return (dst_running < (dst_weight >> 2));
|
||||
}
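As a quick sanity check of the threshold (the numbers below are invented purely for illustration): with a 128-CPU destination domain, dst_weight >> 2 gives a cut-off of 32 busy CPUs.

	allow_numa_imbalance(20, 128);	/* true:  20 < 32, imbalance tolerated   */
	allow_numa_imbalance(40, 128);	/* false: 40 >= 32, spread tasks instead */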
|
||||
|
||||
/*
|
||||
* find_idlest_group() finds and returns the least busy CPU group within the
|
||||
* domain.
|
||||
@ -8775,9 +8810,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
|
||||
.group_type = group_overloaded,
|
||||
};
|
||||
|
||||
imbalance = scale_load_down(NICE_0_LOAD) *
|
||||
(sd->imbalance_pct-100) / 100;
|
||||
|
||||
do {
|
||||
int local_group;
|
||||
|
||||
@ -8831,6 +8863,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
|
||||
switch (local_sgs.group_type) {
|
||||
case group_overloaded:
|
||||
case group_fully_busy:
|
||||
|
||||
/* Calculate allowed imbalance based on load */
|
||||
imbalance = scale_load_down(NICE_0_LOAD) *
|
||||
(sd->imbalance_pct-100) / 100;
|
||||
|
||||
/*
|
||||
* When comparing groups across NUMA domains, it's possible for
|
||||
* the local domain to be very lightly loaded relative to the
|
||||
@ -8887,7 +8924,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
|
||||
* a real need of migration, periodic load balance will
|
||||
* take care of it.
|
||||
*/
|
||||
if (local_sgs.idle_cpus)
|
||||
if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -8989,16 +9026,19 @@ next_group:
|
||||
}
|
||||
}
|
||||
|
||||
static inline long adjust_numa_imbalance(int imbalance, int nr_running)
|
||||
#define NUMA_IMBALANCE_MIN 2
|
||||
|
||||
static inline long adjust_numa_imbalance(int imbalance,
|
||||
int dst_running, int dst_weight)
|
||||
{
|
||||
unsigned int imbalance_min;
|
||||
if (!allow_numa_imbalance(dst_running, dst_weight))
|
||||
return imbalance;
|
||||
|
||||
/*
|
||||
* Allow a small imbalance based on a simple pair of communicating
|
||||
* tasks that remain local when the source domain is almost idle.
|
||||
* tasks that remain local when the destination is lightly loaded.
|
||||
*/
|
||||
imbalance_min = 2;
|
||||
if (nr_running <= imbalance_min)
|
||||
if (imbalance <= NUMA_IMBALANCE_MIN)
|
||||
return 0;
|
||||
|
||||
return imbalance;
|
||||
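Taken together with allow_numa_imbalance(), the rewritten helper above ignores an imbalance of up to NUMA_IMBALANCE_MIN tasks, but only while the destination node is less than a quarter busy. Illustrative calls (values invented):

	adjust_numa_imbalance(2, 10, 64);	/* -> 0: small imbalance, node mostly idle */
	adjust_numa_imbalance(5, 10, 64);	/* -> 5: imbalance above the minimum       */
	adjust_numa_imbalance(2, 40, 64);	/* -> 2: node too busy, no slack granted   */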
@ -9101,9 +9141,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
|
||||
}
|
||||
|
||||
/* Consider allowing a small imbalance between NUMA groups */
|
||||
if (env->sd->flags & SD_NUMA)
|
||||
if (env->sd->flags & SD_NUMA) {
|
||||
env->imbalance = adjust_numa_imbalance(env->imbalance,
|
||||
busiest->sum_nr_running);
|
||||
busiest->sum_nr_running, busiest->group_weight);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
@ -10068,6 +10109,10 @@ static inline int find_new_ilb(void)
|
||||
|
||||
for_each_cpu_and(ilb, nohz.idle_cpus_mask,
|
||||
housekeeping_cpumask(HK_FLAG_MISC)) {
|
||||
|
||||
if (ilb == smp_processor_id())
|
||||
continue;
|
||||
|
||||
if (idle_cpu(ilb))
|
||||
return ilb;
|
||||
}
|
||||
@ -10505,7 +10550,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
|
||||
#endif /* CONFIG_NO_HZ_COMMON */
|
||||
|
||||
/*
|
||||
* idle_balance is called by schedule() if this_cpu is about to become
|
||||
* newidle_balance is called by schedule() if this_cpu is about to become
|
||||
* idle. Attempts to pull tasks from other CPUs.
|
||||
*
|
||||
* Returns:
|
||||
@ -11179,8 +11224,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
|
||||
/*
|
||||
* All the scheduling class methods:
|
||||
*/
|
||||
const struct sched_class fair_sched_class
|
||||
__section("__fair_sched_class") = {
|
||||
DEFINE_SCHED_CLASS(fair) = {
|
||||
|
||||
.enqueue_task = enqueue_task_fair,
|
||||
.dequeue_task = dequeue_task_fair,
|
||||
.yield_task = yield_task_fair,
|
||||
|
@ -364,6 +364,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
|
||||
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
|
||||
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
|
||||
WARN_ON_ONCE(!duration_ns);
|
||||
WARN_ON_ONCE(current->mm);
|
||||
|
||||
rcu_sleep_check();
|
||||
preempt_disable();
|
||||
@ -401,7 +402,7 @@ void cpu_startup_entry(enum cpuhp_state state)
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static int
|
||||
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
select_task_rq_idle(struct task_struct *p, int cpu, int flags)
|
||||
{
|
||||
return task_cpu(p); /* IDLE tasks are never migrated */
|
||||
}
|
||||
@ -483,8 +484,8 @@ static void update_curr_idle(struct rq *rq)
|
||||
/*
|
||||
* Simple, special scheduling class for the per-CPU idle tasks:
|
||||
*/
|
||||
const struct sched_class idle_sched_class
|
||||
__section("__idle_sched_class") = {
|
||||
DEFINE_SCHED_CLASS(idle) = {
|
||||
|
||||
/* no enqueue/yield_task for idle tasks */
|
||||
|
||||
/* dequeue is not valid, we print a debug message there: */
|
||||
|
@ -6,6 +6,134 @@
|
||||
*/
|
||||
#include "sched.h"
|
||||
|
||||
/*
|
||||
* For documentation purposes, here are some membarrier ordering
|
||||
* scenarios to keep in mind:
|
||||
*
|
||||
* A) Userspace thread execution after IPI vs membarrier's memory
|
||||
* barrier before sending the IPI
|
||||
*
|
||||
* Userspace variables:
|
||||
*
|
||||
* int x = 0, y = 0;
|
||||
*
|
||||
* The memory barrier at the start of membarrier() on CPU0 is necessary in
|
||||
* order to enforce the guarantee that any writes occurring on CPU0 before
|
||||
* the membarrier() is executed will be visible to any code executing on
|
||||
* CPU1 after the IPI-induced memory barrier:
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* x = 1
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* b: send IPI IPI-induced mb
|
||||
* c: smp_mb()
|
||||
* r2 = y
|
||||
* y = 1
|
||||
* barrier()
|
||||
* r1 = x
|
||||
*
|
||||
* BUG_ON(r1 == 0 && r2 == 0)
|
||||
*
|
||||
* The write to y and load from x by CPU1 are unordered by the hardware,
|
||||
* so it's possible to have "r1 = x" reordered before "y = 1" at any
|
||||
* point after (b). If the memory barrier at (a) is omitted, then "x = 1"
|
||||
* can be reordered after (a) (although not after (c)), so we get r1 == 0
|
||||
* and r2 == 0. This violates the guarantee that membarrier() is
|
||||
* supposed to provide.
|
||||
*
|
||||
* The timing of the memory barrier at (a) has to ensure that it executes
|
||||
* before the IPI-induced memory barrier on CPU1.
|
||||
*
|
||||
* B) Userspace thread execution before IPI vs membarrier's memory
|
||||
* barrier after completing the IPI
|
||||
*
|
||||
* Userspace variables:
|
||||
*
|
||||
* int x = 0, y = 0;
|
||||
*
|
||||
* The memory barrier at the end of membarrier() on CPU0 is necessary in
|
||||
* order to enforce the guarantee that any writes occurring on CPU1 before
|
||||
* the membarrier() is executed will be visible to any code executing on
|
||||
* CPU0 after the membarrier():
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* x = 1
|
||||
* barrier()
|
||||
* y = 1
|
||||
* r2 = y
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* b: send IPI IPI-induced mb
|
||||
* c: smp_mb()
|
||||
* r1 = x
|
||||
* BUG_ON(r1 == 0 && r2 == 1)
|
||||
*
|
||||
* The writes to x and y are unordered by the hardware, so it's possible to
|
||||
* have "r2 = 1" even though the write to x doesn't execute until (b). If
|
||||
* the memory barrier at (c) is omitted then "r1 = x" can be reordered
|
||||
* before (b) (although not before (a)), so we get "r1 = 0". This violates
|
||||
* the guarantee that membarrier() is supposed to provide.
|
||||
*
|
||||
* The timing of the memory barrier at (c) has to ensure that it executes
|
||||
* after the IPI-induced memory barrier on CPU1.
|
||||
*
|
||||
* C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* d: switch to kthread (includes mb)
|
||||
* b: read rq->curr->mm == NULL
|
||||
* e: switch to user (includes mb)
|
||||
* c: smp_mb()
|
||||
*
|
||||
* Using the scenario from (A), we can show that (a) needs to be paired
|
||||
* with (e). Using the scenario from (B), we can show that (c) needs to
|
||||
* be paired with (d).
|
||||
*
|
||||
* D) exit_mm vs membarrier
|
||||
*
|
||||
* Two thread groups are created, A and B. Thread group B is created by
|
||||
* issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
|
||||
* Let's assume we have a single thread within each thread group (Thread A
|
||||
* and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* exit_mm():
|
||||
* d: smp_mb()
|
||||
* e: current->mm = NULL
|
||||
* b: read rq->curr->mm == NULL
|
||||
* c: smp_mb()
|
||||
*
|
||||
* Using scenario (B), we can show that (c) needs to be paired with (d).
|
||||
*
|
||||
* E) kthread_{use,unuse}_mm vs membarrier
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* kthread_unuse_mm()
|
||||
* d: smp_mb()
|
||||
* e: current->mm = NULL
|
||||
* b: read rq->curr->mm == NULL
|
||||
* kthread_use_mm()
|
||||
* f: current->mm = mm
|
||||
* g: smp_mb()
|
||||
* c: smp_mb()
|
||||
*
|
||||
* Using the scenario from (A), we can show that (a) needs to be paired
|
||||
* with (g). Using the scenario from (B), we can show that (c) needs to
|
||||
* be paired with (d).
|
||||
*/
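For context, the userspace half of scenario (A) is just a store followed by the membarrier() system call. The sketch below is illustrative only and not part of this change; it assumes the process already issued the one-time MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED registration, and error handling is elided.

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int x;	/* shared with other threads */

	void publish(void)
	{
		x = 1;	/* the "x = 1" store from scenario (A) */
		/* pairs with the IPI-induced barrier on every other CPU */
		syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0);
	}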
|
||||
|
||||
/*
|
||||
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
|
||||
* except MEMBARRIER_CMD_QUERY.
|
||||
@ -101,6 +229,18 @@ void membarrier_exec_mmap(struct mm_struct *mm)
|
||||
this_cpu_write(runqueues.membarrier_state, 0);
|
||||
}
|
||||
|
||||
void membarrier_update_current_mm(struct mm_struct *next_mm)
|
||||
{
|
||||
struct rq *rq = this_rq();
|
||||
int membarrier_state = 0;
|
||||
|
||||
if (next_mm)
|
||||
membarrier_state = atomic_read(&next_mm->membarrier_state);
|
||||
if (READ_ONCE(rq->membarrier_state) == membarrier_state)
|
||||
return;
|
||||
WRITE_ONCE(rq->membarrier_state, membarrier_state);
|
||||
}
|
||||
|
||||
static int membarrier_global_expedited(void)
|
||||
{
|
||||
int cpu;
|
||||
@ -139,12 +279,11 @@ static int membarrier_global_expedited(void)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Skip the CPU if it runs a kernel thread. The scheduler
|
||||
* leaves the prior task mm in place as an optimization when
|
||||
* scheduling a kthread.
|
||||
* Skip the CPU if it runs a kernel thread which is not using
|
||||
* a task mm.
|
||||
*/
|
||||
p = rcu_dereference(cpu_rq(cpu)->curr);
|
||||
if (p->flags & PF_KTHREAD)
|
||||
if (!p->mm)
|
||||
continue;
|
||||
|
||||
__cpumask_set_cpu(cpu, tmpmask);
|
||||
|
@ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq)
|
||||
__set_bit(MAX_RT_PRIO, array->bitmap);
|
||||
|
||||
#if defined CONFIG_SMP
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO;
|
||||
rt_rq->highest_prio.next = MAX_RT_PRIO;
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
|
||||
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
|
||||
rt_rq->rt_nr_migratory = 0;
|
||||
rt_rq->overloaded = 0;
|
||||
plist_head_init(&rt_rq->pushable_tasks);
|
||||
@ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO;
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
|
||||
rt_rq->rt_nr_boosted = 0;
|
||||
rt_rq->rq = rq;
|
||||
rt_rq->tg = tg;
|
||||
@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq);
|
||||
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
/* Try to pull RT tasks here if we lower this rq's prio */
|
||||
return rq->rt.highest_prio.curr > prev->prio;
|
||||
return rq->online && rq->rt.highest_prio.curr > prev->prio;
|
||||
}
|
||||
|
||||
static inline int rt_overloaded(struct rq *rq)
|
||||
@ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
|
||||
p = plist_first_entry(&rq->rt.pushable_tasks,
|
||||
struct task_struct, pushable_tasks);
|
||||
rq->rt.highest_prio.next = p->prio;
|
||||
} else
|
||||
rq->rt.highest_prio.next = MAX_RT_PRIO;
|
||||
} else {
|
||||
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
@ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
|
||||
sched_find_first_bit(array->bitmap);
|
||||
}
|
||||
|
||||
} else
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO;
|
||||
} else {
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
|
||||
}
|
||||
|
||||
dec_rt_prio_smp(rt_rq, prio, prev_prio);
|
||||
}
|
||||
@ -1428,14 +1430,14 @@ static void yield_task_rt(struct rq *rq)
|
||||
static int find_lowest_rq(struct task_struct *task);
|
||||
|
||||
static int
|
||||
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
|
||||
{
|
||||
struct task_struct *curr;
|
||||
struct rq *rq;
|
||||
bool test;
|
||||
|
||||
/* For anything but wake ups, just return the task_cpu */
|
||||
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
|
||||
if (!(flags & (WF_TTWU | WF_FORK)))
|
||||
goto out;
|
||||
|
||||
rq = cpu_rq(cpu);
|
||||
@ -1658,7 +1660,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
|
||||
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
|
||||
{
|
||||
if (!task_running(rq, p) &&
|
||||
cpumask_test_cpu(cpu, p->cpus_ptr))
|
||||
cpumask_test_cpu(cpu, &p->cpus_mask))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
@ -1752,7 +1754,7 @@ static int find_lowest_rq(struct task_struct *task)
|
||||
return this_cpu;
|
||||
}
|
||||
|
||||
best_cpu = cpumask_first_and(lowest_mask,
|
||||
best_cpu = cpumask_any_and_distribute(lowest_mask,
|
||||
sched_domain_span(sd));
|
||||
if (best_cpu < nr_cpu_ids) {
|
||||
rcu_read_unlock();
|
||||
@ -1770,7 +1772,7 @@ static int find_lowest_rq(struct task_struct *task)
|
||||
if (this_cpu != -1)
|
||||
return this_cpu;
|
||||
|
||||
cpu = cpumask_any(lowest_mask);
|
||||
cpu = cpumask_any_distribute(lowest_mask);
|
||||
if (cpu < nr_cpu_ids)
|
||||
return cpu;
|
||||
|
||||
@ -1811,7 +1813,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
|
||||
* Also make sure that it wasn't scheduled on its rq.
|
||||
*/
|
||||
if (unlikely(task_rq(task) != rq ||
|
||||
!cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
|
||||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
|
||||
task_running(rq, task) ||
|
||||
!rt_task(task) ||
|
||||
!task_on_rq_queued(task))) {
|
||||
@ -1859,7 +1861,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
|
||||
* running task can migrate over to a CPU that is running a task
|
||||
* of lesser priority.
|
||||
*/
|
||||
static int push_rt_task(struct rq *rq)
|
||||
static int push_rt_task(struct rq *rq, bool pull)
|
||||
{
|
||||
struct task_struct *next_task;
|
||||
struct rq *lowest_rq;
|
||||
@ -1873,6 +1875,34 @@ static int push_rt_task(struct rq *rq)
|
||||
return 0;
|
||||
|
||||
retry:
|
||||
if (is_migration_disabled(next_task)) {
|
||||
struct task_struct *push_task = NULL;
|
||||
int cpu;
|
||||
|
||||
if (!pull || rq->push_busy)
|
||||
return 0;
|
||||
|
||||
cpu = find_lowest_rq(rq->curr);
|
||||
if (cpu == -1 || cpu == rq->cpu)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Given we found a CPU with lower priority than @next_task,
|
||||
* therefore it should be running. However we cannot migrate it
|
||||
* to this other CPU, instead attempt to push the current
|
||||
* running task on this CPU away.
|
||||
*/
|
||||
push_task = get_push_task(rq);
|
||||
if (push_task) {
|
||||
raw_spin_unlock(&rq->lock);
|
||||
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
|
||||
push_task, &rq->push_work);
|
||||
raw_spin_lock(&rq->lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (WARN_ON(next_task == rq->curr))
|
||||
return 0;
|
||||
|
||||
@ -1927,12 +1957,10 @@ retry:
|
||||
deactivate_task(rq, next_task, 0);
|
||||
set_task_cpu(next_task, lowest_rq->cpu);
|
||||
activate_task(lowest_rq, next_task, 0);
|
||||
resched_curr(lowest_rq);
|
||||
ret = 1;
|
||||
|
||||
resched_curr(lowest_rq);
|
||||
|
||||
double_unlock_balance(rq, lowest_rq);
|
||||
|
||||
out:
|
||||
put_task_struct(next_task);
|
||||
|
||||
@ -1942,7 +1970,7 @@ out:
|
||||
static void push_rt_tasks(struct rq *rq)
|
||||
{
|
||||
/* push_rt_task will return true if it moved an RT */
|
||||
while (push_rt_task(rq))
|
||||
while (push_rt_task(rq, false))
|
||||
;
|
||||
}
|
||||
|
||||
@ -2095,7 +2123,8 @@ void rto_push_irq_work_func(struct irq_work *work)
|
||||
*/
|
||||
if (has_pushable_tasks(rq)) {
|
||||
raw_spin_lock(&rq->lock);
|
||||
push_rt_tasks(rq);
|
||||
while (push_rt_task(rq, true))
|
||||
;
|
||||
raw_spin_unlock(&rq->lock);
|
||||
}
|
||||
|
||||
@ -2120,7 +2149,7 @@ static void pull_rt_task(struct rq *this_rq)
|
||||
{
|
||||
int this_cpu = this_rq->cpu, cpu;
|
||||
bool resched = false;
|
||||
struct task_struct *p;
|
||||
struct task_struct *p, *push_task;
|
||||
struct rq *src_rq;
|
||||
int rt_overload_count = rt_overloaded(this_rq);
|
||||
|
||||
@ -2167,6 +2196,7 @@ static void pull_rt_task(struct rq *this_rq)
|
||||
* double_lock_balance, and another CPU could
|
||||
* alter this_rq
|
||||
*/
|
||||
push_task = NULL;
|
||||
double_lock_balance(this_rq, src_rq);
|
||||
|
||||
/*
|
||||
@ -2194,11 +2224,14 @@ static void pull_rt_task(struct rq *this_rq)
|
||||
if (p->prio < src_rq->curr->prio)
|
||||
goto skip;
|
||||
|
||||
resched = true;
|
||||
|
||||
if (is_migration_disabled(p)) {
|
||||
push_task = get_push_task(src_rq);
|
||||
} else {
|
||||
deactivate_task(src_rq, p, 0);
|
||||
set_task_cpu(p, this_cpu);
|
||||
activate_task(this_rq, p, 0);
|
||||
resched = true;
|
||||
}
|
||||
/*
|
||||
* We continue with the search, just in
|
||||
* case there's an even higher prio task
|
||||
@ -2208,6 +2241,13 @@ static void pull_rt_task(struct rq *this_rq)
|
||||
}
|
||||
skip:
|
||||
double_unlock_balance(this_rq, src_rq);
|
||||
|
||||
if (push_task) {
|
||||
raw_spin_unlock(&this_rq->lock);
|
||||
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
|
||||
push_task, &src_rq->push_work);
|
||||
raw_spin_lock(&this_rq->lock);
|
||||
}
|
||||
}
|
||||
|
||||
if (resched)
|
||||
@ -2429,8 +2469,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct sched_class rt_sched_class
|
||||
__section("__rt_sched_class") = {
|
||||
DEFINE_SCHED_CLASS(rt) = {
|
||||
|
||||
.enqueue_task = enqueue_task_rt,
|
||||
.dequeue_task = dequeue_task_rt,
|
||||
.yield_task = yield_task_rt,
|
||||
@ -2449,6 +2489,7 @@ const struct sched_class rt_sched_class
|
||||
.rq_offline = rq_offline_rt,
|
||||
.task_woken = task_woken_rt,
|
||||
.switched_from = switched_from_rt,
|
||||
.find_lock_rq = find_lock_lowest_rq,
|
||||
#endif
|
||||
|
||||
.task_tick = task_tick_rt,
|
||||
|
@ -67,7 +67,6 @@
|
||||
#include <linux/tsacct_kern.h>
|
||||
|
||||
#include <asm/tlb.h>
|
||||
#include <asm-generic/vmlinux.lds.h>
|
||||
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
# include <asm/paravirt.h>
|
||||
@ -257,30 +256,6 @@ struct rt_bandwidth {
|
||||
|
||||
void __dl_clear_params(struct task_struct *p);
|
||||
|
||||
/*
|
||||
* To keep the bandwidth of -deadline tasks and groups under control
|
||||
* we need some place where:
|
||||
* - store the maximum -deadline bandwidth of the system (the group);
|
||||
* - cache the fraction of that bandwidth that is currently allocated.
|
||||
*
|
||||
* This is all done in the data structure below. It is similar to the
|
||||
* one used for RT-throttling (rt_bandwidth), with the main difference
|
||||
* that, since here we are only interested in admission control, we
|
||||
* do not decrease any runtime while the group "executes", neither we
|
||||
* need a timer to replenish it.
|
||||
*
|
||||
* With respect to SMP, the bandwidth is given on a per-CPU basis,
|
||||
* meaning that:
|
||||
* - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
|
||||
* - dl_total_bw array contains, in the i-eth element, the currently
|
||||
* allocated bandwidth on the i-eth CPU.
|
||||
* Moreover, groups consume bandwidth on each CPU, while tasks only
|
||||
* consume bandwidth on the CPU they're running on.
|
||||
* Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
|
||||
* that will be shown the next time the proc or cgroup controls will
|
||||
* be red. It on its turn can be changed by writing on its own
|
||||
* control.
|
||||
*/
|
||||
struct dl_bandwidth {
|
||||
raw_spinlock_t dl_runtime_lock;
|
||||
u64 dl_runtime;
|
||||
@ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void)
|
||||
return sysctl_sched_rt_runtime >= 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* To keep the bandwidth of -deadline tasks under control
|
||||
* we need some place where:
|
||||
* - store the maximum -deadline bandwidth of each cpu;
|
||||
* - cache the fraction of bandwidth that is currently allocated in
|
||||
* each root domain;
|
||||
*
|
||||
* This is all done in the data structure below. It is similar to the
|
||||
* one used for RT-throttling (rt_bandwidth), with the main difference
|
||||
* that, since here we are only interested in admission control, we
|
||||
* do not decrease any runtime while the group "executes", neither we
|
||||
* need a timer to replenish it.
|
||||
*
|
||||
* With respect to SMP, bandwidth is given on a per root domain basis,
|
||||
* meaning that:
|
||||
* - bw (< 100%) is the deadline bandwidth of each CPU;
|
||||
* - total_bw is the currently allocated bandwidth in each root domain;
|
||||
*/
|
||||
struct dl_bw {
|
||||
raw_spinlock_t lock;
|
||||
u64 bw;
|
||||
@ -801,6 +794,15 @@ struct root_domain {
|
||||
struct dl_bw dl_bw;
|
||||
struct cpudl cpudl;
|
||||
|
||||
/*
|
||||
* Indicate whether a root_domain's dl_bw has been checked or
|
||||
* updated. It's a monotonically increasing value.
*
* Corner cases such as 'wrap around' are dangerous in principle, but
* u64 is 'big enough' that this shouldn't be a concern.
|
||||
*/
|
||||
u64 visit_gen;
|
||||
|
||||
#ifdef HAVE_RT_PUSH_IPI
|
||||
/*
|
||||
* For IPI pull requests, loop across the rto_mask.
|
||||
@ -973,6 +975,7 @@ struct rq {
|
||||
unsigned long cpu_capacity_orig;
|
||||
|
||||
struct callback_head *balance_callback;
|
||||
unsigned char balance_flags;
|
||||
|
||||
unsigned char nohz_idle_balance;
|
||||
unsigned char idle_balance;
|
||||
@ -1003,6 +1006,10 @@ struct rq {
|
||||
|
||||
/* This is used to determine avg_idle's max value */
|
||||
u64 max_idle_balance_cost;
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
struct rcuwait hotplug_wait;
|
||||
#endif
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
@ -1048,6 +1055,12 @@ struct rq {
|
||||
/* Must be inspected within a rcu lock section */
|
||||
struct cpuidle_state *idle_state;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
unsigned int nr_pinned;
|
||||
#endif
|
||||
unsigned int push_busy;
|
||||
struct cpu_stop_work push_work;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
@ -1075,6 +1088,16 @@ static inline int cpu_of(struct rq *rq)
|
||||
#endif
|
||||
}
|
||||
|
||||
#define MDF_PUSH 0x01
|
||||
|
||||
static inline bool is_migration_disabled(struct task_struct *p)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
return p->migration_disabled;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_SMT
|
||||
extern void __update_idle_core(struct rq *rq);
|
||||
@ -1221,6 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
|
||||
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
|
||||
rf->clock_update_flags = 0;
|
||||
#endif
|
||||
#ifdef CONFIG_SMP
|
||||
SCHED_WARN_ON(rq->balance_callback);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
|
||||
@ -1382,6 +1408,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
#define BALANCE_WORK 0x01
|
||||
#define BALANCE_PUSH 0x02
|
||||
|
||||
static inline void
|
||||
queue_balance_callback(struct rq *rq,
|
||||
struct callback_head *head,
|
||||
@ -1389,12 +1418,13 @@ queue_balance_callback(struct rq *rq,
|
||||
{
|
||||
lockdep_assert_held(&rq->lock);
|
||||
|
||||
if (unlikely(head->next))
|
||||
if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
|
||||
return;
|
||||
|
||||
head->func = (void (*)(struct callback_head *))func;
|
||||
head->next = rq->balance_callback;
|
||||
rq->balance_callback = head;
|
||||
rq->balance_flags |= BALANCE_WORK;
|
||||
}
|
||||
|
||||
#define rcu_dereference_check_sched_domain(p) \
|
||||
@ -1714,13 +1744,20 @@ static inline int task_on_rq_migrating(struct task_struct *p)
|
||||
return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
|
||||
}
|
||||
|
||||
/*
|
||||
* wake flags
|
||||
*/
|
||||
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
|
||||
#define WF_FORK 0x02 /* Child wakeup after fork */
|
||||
#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
|
||||
#define WF_ON_CPU 0x08 /* Wakee is on_cpu */
|
||||
/* Wake flags. The first three directly map to some SD flag value */
|
||||
#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
|
||||
#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
|
||||
#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
|
||||
|
||||
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
|
||||
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
|
||||
#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static_assert(WF_EXEC == SD_BALANCE_EXEC);
|
||||
static_assert(WF_FORK == SD_BALANCE_FORK);
|
||||
static_assert(WF_TTWU == SD_BALANCE_WAKE);
|
||||
#endif
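Because the first three wake flags now share their values with the matching SD_BALANCE_* flags (which is exactly what the static_asserts above pin down), callers can recover the balance flag with a plain mask instead of a translation table:

	/* mirrors the fast path in select_task_rq_fair(); relies on the asserts above */
	static int wake_flags_to_sd_flag(int wake_flags)
	{
		return wake_flags & 0xF;	/* WF_TTWU -> SD_BALANCE_WAKE, etc. */
	}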
|
||||
|
||||
/*
|
||||
* To aid in avoiding the subversion of "niceness" due to uneven distribution
|
||||
@ -1796,16 +1833,19 @@ struct sched_class {
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
|
||||
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
|
||||
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
|
||||
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
|
||||
|
||||
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
|
||||
|
||||
void (*set_cpus_allowed)(struct task_struct *p,
|
||||
const struct cpumask *newmask);
|
||||
const struct cpumask *newmask,
|
||||
u32 flags);
|
||||
|
||||
void (*rq_online)(struct rq *rq);
|
||||
void (*rq_offline)(struct rq *rq);
|
||||
|
||||
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
|
||||
#endif
|
||||
|
||||
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
|
||||
@ -1833,7 +1873,7 @@ struct sched_class {
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
void (*task_change_group)(struct task_struct *p, int type);
|
||||
#endif
|
||||
} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
|
||||
};
|
||||
|
||||
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
@ -1847,6 +1887,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
|
||||
next->sched_class->set_next_task(rq, next, false);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Helper to define a sched_class instance; each one is placed in a separate
|
||||
* section which is ordered by the linker script:
|
||||
*
|
||||
* include/asm-generic/vmlinux.lds.h
|
||||
*
|
||||
* Also enforce alignment on the instance, not the type, to guarantee layout.
|
||||
*/
|
||||
#define DEFINE_SCHED_CLASS(name) \
|
||||
const struct sched_class name##_sched_class \
|
||||
__aligned(__alignof__(struct sched_class)) \
|
||||
__section("__" #name "_sched_class")
|
||||
|
||||
/* Defined in include/asm-generic/vmlinux.lds.h */
|
||||
extern struct sched_class __begin_sched_classes[];
|
||||
extern struct sched_class __end_sched_classes[];
|
||||
@ -1889,13 +1943,35 @@ static inline bool sched_fair_runnable(struct rq *rq)
|
||||
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
|
||||
extern struct task_struct *pick_next_task_idle(struct rq *rq);
|
||||
|
||||
#define SCA_CHECK 0x01
|
||||
#define SCA_MIGRATE_DISABLE 0x02
|
||||
#define SCA_MIGRATE_ENABLE 0x04
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
extern void update_group_capacity(struct sched_domain *sd, int cpu);
|
||||
|
||||
extern void trigger_load_balance(struct rq *rq);
|
||||
|
||||
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
|
||||
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
|
||||
|
||||
static inline struct task_struct *get_push_task(struct rq *rq)
|
||||
{
|
||||
struct task_struct *p = rq->curr;
|
||||
|
||||
lockdep_assert_held(&rq->lock);
|
||||
|
||||
if (rq->push_busy)
|
||||
return NULL;
|
||||
|
||||
if (p->nr_cpus_allowed == 1)
|
||||
return NULL;
|
||||
|
||||
rq->push_busy = true;
|
||||
return get_task_struct(p);
|
||||
}
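get_push_task() pairs with push_cpu_stop(): the caller takes a task reference and marks the runqueue push-busy, then hands the work to the stopper thread, which undoes both. The calling pattern used by the RT and deadline pull paths earlier in this diff boils down to (locking simplified):

	struct task_struct *push_task = get_push_task(src_rq);

	if (push_task) {
		raw_spin_unlock(&this_rq->lock);
		stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
				    push_task, &src_rq->push_work);
		raw_spin_lock(&this_rq->lock);
	}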
|
||||
|
||||
extern int push_cpu_stop(void *arg);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static int
|
||||
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
select_task_rq_stop(struct task_struct *p, int cpu, int flags)
|
||||
{
|
||||
return task_cpu(p); /* stop tasks never migrate */
|
||||
}
|
||||
@ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq)
|
||||
/*
|
||||
* Simple, special scheduling class for the per-CPU stop tasks:
|
||||
*/
|
||||
const struct sched_class stop_sched_class
|
||||
__section("__stop_sched_class") = {
|
||||
DEFINE_SCHED_CLASS(stop) = {
|
||||
|
||||
.enqueue_task = enqueue_task_stop,
|
||||
.dequeue_task = dequeue_task_stop,
|
||||
|
@ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1;
|
||||
DEFINE_MUTEX(sched_energy_mutex);
|
||||
bool sched_energy_update;
|
||||
|
||||
void rebuild_sched_domains_energy(void)
|
||||
{
|
||||
mutex_lock(&sched_energy_mutex);
|
||||
sched_energy_update = true;
|
||||
rebuild_sched_domains();
|
||||
sched_energy_update = false;
|
||||
mutex_unlock(&sched_energy_mutex);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROC_SYSCTL
|
||||
int sched_energy_aware_handler(struct ctl_table *table, int write,
|
||||
void *buffer, size_t *lenp, loff_t *ppos)
|
||||
@ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
|
||||
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
if (!ret && write) {
|
||||
state = static_branch_unlikely(&sched_energy_present);
|
||||
if (state != sysctl_sched_energy_aware) {
|
||||
mutex_lock(&sched_energy_mutex);
|
||||
sched_energy_update = 1;
|
||||
rebuild_sched_domains();
|
||||
sched_energy_update = 0;
|
||||
mutex_unlock(&sched_energy_mutex);
|
||||
}
|
||||
if (state != sysctl_sched_energy_aware)
|
||||
rebuild_sched_domains_energy();
|
||||
}
|
||||
|
||||
return ret;
|
||||
@ -324,6 +328,7 @@ static void sched_energy_set(bool has_eas)
|
||||
* 3. no SMT is detected.
|
||||
* 4. the EM complexity is low enough to keep scheduling overheads low;
|
||||
* 5. schedutil is driving the frequency of all CPUs of the rd;
|
||||
* 6. frequency invariance support is present;
|
||||
*
|
||||
* The complexity of the Energy Model is defined as:
|
||||
*
|
||||
@ -372,6 +377,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
|
||||
goto free;
|
||||
}
|
||||
|
||||
if (!arch_scale_freq_invariant()) {
|
||||
if (sched_debug()) {
|
||||
pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
|
||||
cpumask_pr_args(cpu_map));
|
||||
}
|
||||
goto free;
|
||||
}
|
||||
|
||||
for_each_cpu(i, cpu_map) {
|
||||
/* Skip already covered CPUs. */
|
||||
if (find_pd(pd, i))
|
||||
@ -516,6 +529,7 @@ static int init_rootdomain(struct root_domain *rd)
|
||||
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
|
||||
#endif
|
||||
|
||||
rd->visit_gen = 0;
|
||||
init_dl_bw(&rd->dl_bw);
|
||||
if (cpudl_init(&rd->cpudl) != 0)
|
||||
goto free_rto_mask;
|
||||
@ -674,6 +688,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
struct sched_domain *tmp;
|
||||
int numa_distance = 0;
|
||||
|
||||
/* Remove the sched domains which do not contribute to scheduling. */
|
||||
for (tmp = sd; tmp; ) {
|
||||
@ -705,6 +720,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
||||
sd->child = NULL;
|
||||
}
|
||||
|
||||
for (tmp = sd; tmp; tmp = tmp->parent)
|
||||
numa_distance += !!(tmp->flags & SD_NUMA);
|
||||
|
||||
/*
|
||||
* FIXME: Diameter >=3 is misrepresented.
|
||||
*
|
||||
* Smallest diameter=3 topology is:
|
||||
*
|
||||
* node 0 1 2 3
|
||||
* 0: 10 20 30 40
|
||||
* 1: 20 10 20 30
|
||||
* 2: 30 20 10 20
|
||||
* 3: 40 30 20 10
|
||||
*
|
||||
* 0 --- 1 --- 2 --- 3
|
||||
*
|
||||
* NUMA-3 0-3 N/A N/A 0-3
|
||||
* groups: {0-2},{1-3} {1-3},{0-2}
|
||||
*
|
||||
* NUMA-2 0-2 0-3 0-3 1-3
|
||||
* groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
|
||||
*
|
||||
* NUMA-1 0-1 0-2 1-3 2-3
|
||||
* groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
|
||||
*
|
||||
* NUMA-0 0 1 2 3
|
||||
*
|
||||
* The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
|
||||
* group span isn't a subset of the domain span.
|
||||
*/
|
||||
WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");
|
||||
|
||||
sched_domain_debug(sd, cpu);
|
||||
|
||||
rq_attach_root(rq, rd);
|
||||
|
kernel/smp.c
@ -27,7 +27,7 @@
|
||||
#include "smpboot.h"
|
||||
#include "sched/smp.h"
|
||||
|
||||
#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK)
|
||||
#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
|
||||
|
||||
struct call_function_data {
|
||||
call_single_data_t __percpu *csd;
|
||||
@ -130,7 +130,7 @@ static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd)
|
||||
|
||||
csd_type = CSD_TYPE(csd);
|
||||
if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
|
||||
return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */
|
||||
return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -146,7 +146,7 @@ static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 t
|
||||
bool firsttime;
|
||||
u64 ts2, ts_delta;
|
||||
call_single_data_t *cpu_cur_csd;
|
||||
unsigned int flags = READ_ONCE(csd->flags);
|
||||
unsigned int flags = READ_ONCE(csd->node.u_flags);
|
||||
|
||||
if (!(flags & CSD_FLAG_LOCK)) {
|
||||
if (!unlikely(*bug_id))
|
||||
@ -224,14 +224,14 @@ static void csd_lock_record(call_single_data_t *csd)
|
||||
|
||||
static __always_inline void csd_lock_wait(call_single_data_t *csd)
|
||||
{
|
||||
smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
|
||||
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
|
||||
}
|
||||
#endif
|
||||
|
||||
static __always_inline void csd_lock(call_single_data_t *csd)
|
||||
{
|
||||
csd_lock_wait(csd);
|
||||
csd->flags |= CSD_FLAG_LOCK;
|
||||
csd->node.u_flags |= CSD_FLAG_LOCK;
|
||||
|
||||
/*
|
||||
* prevent CPU from reordering the above assignment
|
||||
@ -243,12 +243,12 @@ static __always_inline void csd_lock(call_single_data_t *csd)
|
||||
|
||||
static __always_inline void csd_unlock(call_single_data_t *csd)
|
||||
{
|
||||
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
|
||||
WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
|
||||
|
||||
/*
|
||||
* ensure we're all done before releasing data:
|
||||
*/
|
||||
smp_store_release(&csd->flags, 0);
|
||||
smp_store_release(&csd->node.u_flags, 0);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
|
||||
@ -300,7 +300,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
__smp_call_single_queue(cpu, &csd->llist);
|
||||
__smp_call_single_queue(cpu, &csd->node.llist);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -353,7 +353,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||
* We don't have to use the _safe() variant here
|
||||
* because we are not invoking the IPI handlers yet.
|
||||
*/
|
||||
llist_for_each_entry(csd, entry, llist) {
|
||||
llist_for_each_entry(csd, entry, node.llist) {
|
||||
switch (CSD_TYPE(csd)) {
|
||||
case CSD_TYPE_ASYNC:
|
||||
case CSD_TYPE_SYNC:
|
||||
@ -378,16 +378,16 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||
* First; run all SYNC callbacks, people are waiting for us.
|
||||
*/
|
||||
prev = NULL;
|
||||
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
|
||||
llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
|
||||
/* Do we wait until *after* callback? */
|
||||
if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
|
||||
smp_call_func_t func = csd->func;
|
||||
void *info = csd->info;
|
||||
|
||||
if (prev) {
|
||||
prev->next = &csd_next->llist;
|
||||
prev->next = &csd_next->node.llist;
|
||||
} else {
|
||||
entry = &csd_next->llist;
|
||||
entry = &csd_next->node.llist;
|
||||
}
|
||||
|
||||
csd_lock_record(csd);
|
||||
@ -395,7 +395,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||
csd_unlock(csd);
|
||||
csd_lock_record(NULL);
|
||||
} else {
|
||||
prev = &csd->llist;
|
||||
prev = &csd->node.llist;
|
||||
}
|
||||
}
|
||||
|
||||
@ -406,14 +406,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||
* Second; run all !SYNC callbacks.
|
||||
*/
|
||||
prev = NULL;
|
||||
llist_for_each_entry_safe(csd, csd_next, entry, llist) {
|
||||
llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
|
||||
int type = CSD_TYPE(csd);
|
||||
|
||||
if (type != CSD_TYPE_TTWU) {
|
||||
if (prev) {
|
||||
prev->next = &csd_next->llist;
|
||||
prev->next = &csd_next->node.llist;
|
||||
} else {
|
||||
entry = &csd_next->llist;
|
||||
entry = &csd_next->node.llist;
|
||||
}
|
||||
|
||||
if (type == CSD_TYPE_ASYNC) {
|
||||
@ -429,7 +429,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
|
||||
}
|
||||
|
||||
} else {
|
||||
prev = &csd->llist;
|
||||
prev = &csd->node.llist;
|
||||
}
|
||||
}
|
||||
|
||||
@ -465,7 +465,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
|
||||
{
|
||||
call_single_data_t *csd;
|
||||
call_single_data_t csd_stack = {
|
||||
.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
|
||||
.node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
|
||||
};
|
||||
int this_cpu;
|
||||
int err;
|
||||
@ -502,8 +502,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
|
||||
csd->func = func;
|
||||
csd->info = info;
|
||||
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
|
||||
csd->src = smp_processor_id();
|
||||
csd->dst = cpu;
|
||||
csd->node.src = smp_processor_id();
|
||||
csd->node.dst = cpu;
|
||||
#endif
|
||||
|
||||
err = generic_exec_single(cpu, csd);
|
||||
@ -544,12 +544,12 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
|
||||
|
||||
preempt_disable();
|
||||
|
||||
if (csd->flags & CSD_FLAG_LOCK) {
|
||||
if (csd->node.u_flags & CSD_FLAG_LOCK) {
|
||||
err = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
csd->flags = CSD_FLAG_LOCK;
|
||||
csd->node.u_flags = CSD_FLAG_LOCK;
|
||||
smp_wmb();
|
||||
|
||||
err = generic_exec_single(cpu, csd);
|
||||
@ -667,14 +667,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
|
||||
|
||||
csd_lock(csd);
|
||||
if (wait)
|
||||
csd->flags |= CSD_TYPE_SYNC;
|
||||
csd->node.u_flags |= CSD_TYPE_SYNC;
|
||||
csd->func = func;
|
||||
csd->info = info;
|
||||
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
|
||||
csd->src = smp_processor_id();
|
||||
csd->dst = cpu;
|
||||
csd->node.src = smp_processor_id();
|
||||
csd->node.dst = cpu;
|
||||
#endif
|
||||
if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
|
||||
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu)))
|
||||
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
|
||||
}
|
||||
|
||||
|
@ -42,11 +42,27 @@ struct cpu_stopper {
|
||||
struct list_head works; /* list of pending works */
|
||||
|
||||
struct cpu_stop_work stop_work; /* for stop_cpus */
|
||||
unsigned long caller;
|
||||
cpu_stop_fn_t fn;
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
|
||||
static bool stop_machine_initialized = false;
|
||||
|
||||
void print_stop_info(const char *log_lvl, struct task_struct *task)
|
||||
{
|
||||
/*
|
||||
* If @task is a stopper task, it cannot migrate and task_cpu() is
|
||||
* stable.
|
||||
*/
|
||||
struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task));
|
||||
|
||||
if (task != stopper->thread)
|
||||
return;
|
||||
|
||||
printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
|
||||
}
|
||||
|
||||
/* static data for stop_cpus */
|
||||
static DEFINE_MUTEX(stop_cpus_mutex);
|
||||
static bool stop_cpus_in_progress;
|
||||
@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
|
||||
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
|
||||
{
|
||||
struct cpu_stop_done done;
|
||||
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
|
||||
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
|
||||
|
||||
cpu_stop_init_done(&done, 1);
|
||||
if (!cpu_stop_queue_work(cpu, &work))
|
||||
@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
|
||||
work1 = work2 = (struct cpu_stop_work){
|
||||
.fn = multi_cpu_stop,
|
||||
.arg = &msdata,
|
||||
.done = &done
|
||||
.done = &done,
|
||||
.caller = _RET_IP_,
|
||||
};
|
||||
|
||||
cpu_stop_init_done(&done, 2);
|
||||
@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
|
||||
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
|
||||
struct cpu_stop_work *work_buf)
|
||||
{
|
||||
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
|
||||
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
|
||||
return cpu_stop_queue_work(cpu, work_buf);
|
||||
}
|
||||
|
||||
@ -487,6 +504,8 @@ repeat:
|
||||
int ret;
|
||||
|
||||
/* cpu stop callbacks must not sleep, make in_atomic() == T */
|
||||
stopper->caller = work->caller;
|
||||
stopper->fn = fn;
|
||||
preempt_count_inc();
|
||||
ret = fn(arg);
|
||||
if (done) {
|
||||
@ -495,6 +514,8 @@ repeat:
|
||||
cpu_stop_signal_done(done);
|
||||
}
|
||||
preempt_count_dec();
|
||||
stopper->fn = NULL;
|
||||
stopper->caller = 0;
|
||||
WARN_ONCE(preempt_count(),
|
||||
"cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
|
||||
goto repeat;
|
||||
|
@ -293,10 +293,8 @@ static void nohz_full_kick_func(struct irq_work *work)
|
||||
/* Empty, the tick restart happens on tick_nohz_irq_exit() */
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
|
||||
.func = nohz_full_kick_func,
|
||||
.flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ),
|
||||
};
|
||||
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
|
||||
IRQ_WORK_INIT_HARD(nohz_full_kick_func);
|
||||
|
||||
/*
|
||||
* Kick this CPU if it's full dynticks in order to force it to
|
||||
|
@ -1096,7 +1096,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
|
||||
return -EINVAL;
|
||||
|
||||
work = this_cpu_ptr(&send_signal_work);
|
||||
if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
|
||||
if (irq_work_is_busy(&work->irq_work))
|
||||
return -EBUSY;
|
||||
|
||||
/* Add the current task, which is the target of sending signal,
|
||||
|
@ -4908,6 +4908,10 @@ static void unbind_workers(int cpu)
|
||||
pool->flags |= POOL_DISASSOCIATED;
|
||||
|
||||
raw_spin_unlock_irq(&pool->lock);
|
||||
|
||||
for_each_pool_worker(worker, pool)
|
||||
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
|
||||
|
||||
mutex_unlock(&wq_pool_attach_mutex);
|
||||
|
||||
/*
|
||||
|
@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p,
|
||||
return next;
|
||||
}
|
||||
EXPORT_SYMBOL(cpumask_any_and_distribute);
|
||||
|
||||
int cpumask_any_distribute(const struct cpumask *srcp)
|
||||
{
|
||||
int next, prev;
|
||||
|
||||
/* NOTE: our first selection will skip 0. */
|
||||
prev = __this_cpu_read(distribute_cpu_mask_prev);
|
||||
|
||||
next = cpumask_next(prev, srcp);
|
||||
if (next >= nr_cpu_ids)
|
||||
next = cpumask_first(srcp);
|
||||
|
||||
if (next < nr_cpu_ids)
|
||||
__this_cpu_write(distribute_cpu_mask_prev, next);
|
||||
|
||||
return next;
|
||||
}
|
||||
EXPORT_SYMBOL(cpumask_any_distribute);
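Like cpumask_any_and_distribute() above it, the new helper rotates a per-CPU cursor so that repeated callers do not all converge on the same CPU. A hedged usage sketch (the mask contents are invented):

	/* lowest_mask = { 2, 5, 9 }: successive calls may yield 5, 9, 2, 5, ... */
	int cpu = cpumask_any_distribute(lowest_mask);
	if (cpu < nr_cpu_ids)
		; /* hand the task to @cpu, as find_lowest_rq() now does */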
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/kexec.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/stop_machine.h>
|
||||
|
||||
static char dump_stack_arch_desc_str[128];
|
||||
|
||||
@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl)
|
||||
log_lvl, dump_stack_arch_desc_str);
|
||||
|
||||
print_worker_info(log_lvl, current);
|
||||
print_stop_info(log_lvl, current);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
|
||||
if (current->nr_cpus_allowed == 1)
|
||||
goto out;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
if (current->migration_disabled)
|
||||
goto out;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* It is valid to assume CPU-locality during early bootup:
|
||||
*/
|
||||
|
@ -11179,8 +11179,7 @@ static int __init net_dev_init(void)
|
||||
INIT_LIST_HEAD(&sd->poll_list);
|
||||
sd->output_queue_tailp = &sd->output_queue;
|
||||
#ifdef CONFIG_RPS
|
||||
sd->csd.func = rps_trigger_softirq;
|
||||
sd->csd.info = sd;
|
||||
INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
|
||||
sd->cpu = i;
|
||||
#endif
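INIT_CSD() replaces the two open-coded assignments removed above; conceptually it is little more than a designated initializer over the whole call_single_data_t. The sketch below uses an illustrative name and is not the header's exact definition:

	#define INIT_CSD_SKETCH(_csd, _func, _info)		\
	do {							\
		*(_csd) = (call_single_data_t){			\
			.func = (_func),			\
			.info = (_info),			\
		};						\
	} while (0)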
|
||||
|
||||
|