mirror of
https://github.com/torvalds/linux.git
synced 2024-11-23 04:31:50 +00:00
cgroup: Changes for v6.13
- cpu.stat now also shows niced CPU time. - Freezer and cpuset optimizations. - Other misc changes. -----BEGIN PGP SIGNATURE----- iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCZztlgg4cdGpAa2VybmVs Lm9yZwAKCRCxYfJx3gVYGbohAQDE/enqpAX9vSOpQPne4ZzgcPlGTrCwBcka3Z5z 4aOF0AD/SmdjcJ/EULisD/2O27ovsGAtqDjngrrZwNUTbCNkTQQ= =pKyo -----END PGP SIGNATURE----- Merge tag 'cgroup-for-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup Pull cgroup updates from Tejun Heo: - cpu.stat now also shows niced CPU time - Freezer and cpuset optimizations - Other misc changes * tag 'cgroup-for-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup/cpuset: Disable cpuset_cpumask_can_shrink() test if not load balancing cgroup/cpuset: Further optimize code if CONFIG_CPUSETS_V1 not set cgroup/cpuset: Enforce at most one rebuild_sched_domains_locked() call per operation cgroup/cpuset: Revert "Allow suppression of sched domain rebuild in update_cpumasks_hier()" MAINTAINERS: remove Zefan Li cgroup/freezer: Add cgroup CGRP_FROZEN flag update helper cgroup/freezer: Reduce redundant traversal for cgroup_freeze cgroup/bpf: only cgroup v2 can be attached by bpf programs Revert "cgroup: Fix memory leak caused by missing cgroup_bpf_offline" selftests/cgroup: Fix compile error in test_cpu.c cgroup/rstat: Selftests for niced CPU statistics cgroup/rstat: Tracking cgroup-level niced CPU time cgroup/cpuset: Fix spelling errors in file kernel/cgroup/cpuset.c
This commit is contained in:
commit
7586d52765
3
CREDITS
3
CREDITS
@ -579,6 +579,9 @@ N: Zach Brown
|
||||
E: zab@zabbo.net
|
||||
D: maestro pci sound
|
||||
|
||||
N: Zefan Li
|
||||
D: Contribution to control group stuff
|
||||
|
||||
N: David Brownell
|
||||
D: Kernel engineer, mentor, and friend. Maintained USB EHCI and
|
||||
D: gadget layers, SPI subsystem, GPIO subsystem, and more than a few
|
||||
|
@ -5756,7 +5756,6 @@ F: kernel/context_tracking.c
|
||||
|
||||
CONTROL GROUP (CGROUP)
|
||||
M: Tejun Heo <tj@kernel.org>
|
||||
M: Zefan Li <lizefan.x@bytedance.com>
|
||||
M: Johannes Weiner <hannes@cmpxchg.org>
|
||||
M: Michal Koutný <mkoutny@suse.com>
|
||||
L: cgroups@vger.kernel.org
|
||||
@ -5785,7 +5784,6 @@ F: include/linux/blk-cgroup.h
|
||||
|
||||
CONTROL GROUP - CPUSET
|
||||
M: Waiman Long <longman@redhat.com>
|
||||
M: Zefan Li <lizefan.x@bytedance.com>
|
||||
L: cgroups@vger.kernel.org
|
||||
S: Maintained
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
|
||||
|
@ -327,6 +327,7 @@ struct cgroup_base_stat {
|
||||
#ifdef CONFIG_SCHED_CORE
|
||||
u64 forceidle_sum;
|
||||
#endif
|
||||
u64 ntime;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -397,7 +398,7 @@ struct cgroup_freezer_state {
|
||||
bool freeze;
|
||||
|
||||
/* Should the cgroup actually be frozen? */
|
||||
int e_freeze;
|
||||
bool e_freeze;
|
||||
|
||||
/* Fields below are protected by css_set_lock */
|
||||
|
||||
|
@ -2140,8 +2140,10 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
|
||||
if (ret)
|
||||
goto exit_stats;
|
||||
|
||||
ret = cgroup_bpf_inherit(root_cgrp);
|
||||
WARN_ON_ONCE(ret);
|
||||
if (root == &cgrp_dfl_root) {
|
||||
ret = cgroup_bpf_inherit(root_cgrp);
|
||||
WARN_ON_ONCE(ret);
|
||||
}
|
||||
|
||||
trace_cgroup_setup_root(root);
|
||||
|
||||
@ -2314,10 +2316,8 @@ static void cgroup_kill_sb(struct super_block *sb)
|
||||
* And don't kill the default root.
|
||||
*/
|
||||
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
|
||||
!percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
|
||||
cgroup_bpf_offline(&root->cgrp);
|
||||
!percpu_ref_is_dying(&root->cgrp.self.refcnt))
|
||||
percpu_ref_kill(&root->cgrp.self.refcnt);
|
||||
}
|
||||
cgroup_put(&root->cgrp);
|
||||
kernfs_kill_sb(sb);
|
||||
}
|
||||
@ -5710,9 +5710,11 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
|
||||
if (ret)
|
||||
goto out_kernfs_remove;
|
||||
|
||||
ret = cgroup_bpf_inherit(cgrp);
|
||||
if (ret)
|
||||
goto out_psi_free;
|
||||
if (cgrp->root == &cgrp_dfl_root) {
|
||||
ret = cgroup_bpf_inherit(cgrp);
|
||||
if (ret)
|
||||
goto out_psi_free;
|
||||
}
|
||||
|
||||
/*
|
||||
* New cgroup inherits effective freeze counter, and
|
||||
@ -6026,7 +6028,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
|
||||
|
||||
cgroup1_check_for_release(parent);
|
||||
|
||||
cgroup_bpf_offline(cgrp);
|
||||
if (cgrp->root == &cgrp_dfl_root)
|
||||
cgroup_bpf_offline(cgrp);
|
||||
|
||||
/* put the base reference */
|
||||
percpu_ref_kill(&cgrp->self.refcnt);
|
||||
|
@ -84,9 +84,19 @@ static bool have_boot_isolcpus;
|
||||
static struct list_head remote_children;
|
||||
|
||||
/*
|
||||
* A flag to force sched domain rebuild at the end of an operation while
|
||||
* inhibiting it in the intermediate stages when set. Currently it is only
|
||||
* set in hotplug code.
|
||||
* A flag to force sched domain rebuild at the end of an operation.
|
||||
* It can be set in
|
||||
* - update_partition_sd_lb()
|
||||
* - remote_partition_check()
|
||||
* - update_cpumasks_hier()
|
||||
* - cpuset_update_flag()
|
||||
* - cpuset_hotplug_update_tasks()
|
||||
* - cpuset_handle_hotplug()
|
||||
*
|
||||
* Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
|
||||
*
|
||||
* Note that update_relax_domain_level() in cpuset-v1.c can still call
|
||||
* rebuild_sched_domains_locked() directly without using this flag.
|
||||
*/
|
||||
static bool force_sd_rebuild;
|
||||
|
||||
@ -283,6 +293,12 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
|
||||
mutex_unlock(&cpuset_mutex);
|
||||
}
|
||||
|
||||
static inline bool cpuset_v2(void)
|
||||
{
|
||||
return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
|
||||
cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
|
||||
}
|
||||
|
||||
/*
|
||||
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
|
||||
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
|
||||
@ -293,7 +309,7 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
|
||||
*/
|
||||
static inline bool is_in_v2_mode(void)
|
||||
{
|
||||
return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
||||
return cpuset_v2() ||
|
||||
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
|
||||
}
|
||||
|
||||
@ -565,12 +581,24 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
|
||||
|
||||
/*
|
||||
* We can't shrink if we won't have enough room for SCHED_DEADLINE
|
||||
* tasks.
|
||||
* tasks. This check is not done when scheduling is disabled as the
|
||||
* users should know what they are doing.
|
||||
*
|
||||
* For v1, effective_cpus == cpus_allowed & user_xcpus() returns
|
||||
* cpus_allowed.
|
||||
*
|
||||
* For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
|
||||
* for non-isolated partition root. At this point, the target
|
||||
* effective_cpus isn't computed yet. user_xcpus() is the best
|
||||
* approximation.
|
||||
*
|
||||
* TBD: May need to precompute the real effective_cpus here in case
|
||||
* incorrect scheduling of SCHED_DEADLINE tasks in a partition
|
||||
* becomes an issue.
|
||||
*/
|
||||
ret = -EBUSY;
|
||||
if (is_cpu_exclusive(cur) &&
|
||||
!cpuset_cpumask_can_shrink(cur->cpus_allowed,
|
||||
trial->cpus_allowed))
|
||||
if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
|
||||
!cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
@ -728,7 +756,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
|
||||
int nslot; /* next empty doms[] struct cpumask slot */
|
||||
struct cgroup_subsys_state *pos_css;
|
||||
bool root_load_balance = is_sched_load_balance(&top_cpuset);
|
||||
bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
|
||||
bool cgrpv2 = cpuset_v2();
|
||||
int nslot_update;
|
||||
|
||||
doms = NULL;
|
||||
@ -990,6 +1018,7 @@ void rebuild_sched_domains_locked(void)
|
||||
|
||||
lockdep_assert_cpus_held();
|
||||
lockdep_assert_held(&cpuset_mutex);
|
||||
force_sd_rebuild = false;
|
||||
|
||||
/*
|
||||
* If we have raced with CPU hotplug, return early to avoid
|
||||
@ -1164,8 +1193,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
|
||||
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
||||
}
|
||||
|
||||
if (rebuild_domains && !force_sd_rebuild)
|
||||
rebuild_sched_domains_locked();
|
||||
if (rebuild_domains)
|
||||
cpuset_force_rebuild();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1187,7 +1216,7 @@ static void reset_partition_data(struct cpuset *cs)
|
||||
{
|
||||
struct cpuset *parent = parent_cs(cs);
|
||||
|
||||
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
|
||||
if (!cpuset_v2())
|
||||
return;
|
||||
|
||||
lockdep_assert_held(&callback_lock);
|
||||
@ -1339,7 +1368,7 @@ static inline bool is_local_partition(struct cpuset *cs)
|
||||
* remote_partition_enable - Enable current cpuset as a remote partition root
|
||||
* @cs: the cpuset to update
|
||||
* @new_prs: new partition_root_state
|
||||
* @tmp: temparary masks
|
||||
* @tmp: temporary masks
|
||||
* Return: 0 if successful, errcode if error
|
||||
*
|
||||
* Enable the current cpuset to become a remote partition root taking CPUs
|
||||
@ -1377,7 +1406,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
|
||||
update_unbound_workqueue_cpumask(isolcpus_updated);
|
||||
|
||||
/*
|
||||
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
|
||||
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
|
||||
*/
|
||||
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
|
||||
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
|
||||
@ -1387,7 +1416,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
|
||||
/*
|
||||
* remote_partition_disable - Remove current cpuset from remote partition list
|
||||
* @cs: the cpuset to update
|
||||
* @tmp: temparary masks
|
||||
* @tmp: temporary masks
|
||||
*
|
||||
* The effective_cpus is also updated.
|
||||
*
|
||||
@ -1413,7 +1442,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
|
||||
update_unbound_workqueue_cpumask(isolcpus_updated);
|
||||
|
||||
/*
|
||||
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
|
||||
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
|
||||
*/
|
||||
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
|
||||
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
|
||||
@ -1423,7 +1452,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
|
||||
* remote_cpus_update - cpus_exclusive change of remote partition
|
||||
* @cs: the cpuset to be updated
|
||||
* @newmask: the new effective_xcpus mask
|
||||
* @tmp: temparary masks
|
||||
* @tmp: temporary masks
|
||||
*
|
||||
* top_cpuset and subpartitions_cpus will be updated or partition can be
|
||||
* invalidated.
|
||||
@ -1465,7 +1494,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
|
||||
update_unbound_workqueue_cpumask(isolcpus_updated);
|
||||
|
||||
/*
|
||||
* Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
|
||||
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
|
||||
*/
|
||||
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
|
||||
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
|
||||
@ -1480,7 +1509,7 @@ invalidate:
|
||||
* @cs: the cpuset to be updated
|
||||
* @newmask: the new effective_xcpus mask
|
||||
* @delmask: temporary mask for deletion (not in tmp)
|
||||
* @tmp: temparary masks
|
||||
* @tmp: temporary masks
|
||||
*
|
||||
* This should be called before the given cs has updated its cpus_allowed
|
||||
* and/or effective_xcpus.
|
||||
@ -1512,8 +1541,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
|
||||
remote_partition_disable(child, tmp);
|
||||
disable_cnt++;
|
||||
}
|
||||
if (disable_cnt && !force_sd_rebuild)
|
||||
rebuild_sched_domains_locked();
|
||||
if (disable_cnt)
|
||||
cpuset_force_rebuild();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1922,12 +1951,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
* update_cpumasks_hier() flags
|
||||
*/
|
||||
#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
|
||||
#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
|
||||
|
||||
/*
|
||||
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
|
||||
* @cs: the cpuset to consider
|
||||
@ -1942,7 +1965,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
|
||||
* Called with cpuset_mutex held
|
||||
*/
|
||||
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
|
||||
int flags)
|
||||
bool force)
|
||||
{
|
||||
struct cpuset *cp;
|
||||
struct cgroup_subsys_state *pos_css;
|
||||
@ -2007,12 +2030,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
|
||||
* Skip the whole subtree if
|
||||
* 1) the cpumask remains the same,
|
||||
* 2) has no partition root state,
|
||||
* 3) HIER_CHECKALL flag not set, and
|
||||
* 3) force flag not set, and
|
||||
* 4) for v2 load balance state same as its parent.
|
||||
*/
|
||||
if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
|
||||
if (!cp->partition_root_state && !force &&
|
||||
cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
|
||||
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
||||
(!cpuset_v2() ||
|
||||
(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
|
||||
pos_css = css_rightmost_descendant(pos_css);
|
||||
continue;
|
||||
@ -2086,8 +2109,7 @@ get_css:
|
||||
* from parent if current cpuset isn't a valid partition root
|
||||
* and their load balance states differ.
|
||||
*/
|
||||
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
||||
!is_partition_valid(cp) &&
|
||||
if (cpuset_v2() && !is_partition_valid(cp) &&
|
||||
(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
|
||||
if (is_sched_load_balance(parent))
|
||||
set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
|
||||
@ -2103,8 +2125,7 @@ get_css:
|
||||
*/
|
||||
if (!cpumask_empty(cp->cpus_allowed) &&
|
||||
is_sched_load_balance(cp) &&
|
||||
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
||||
is_partition_valid(cp)))
|
||||
(!cpuset_v2() || is_partition_valid(cp)))
|
||||
need_rebuild_sched_domains = true;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -2112,9 +2133,8 @@ get_css:
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) &&
|
||||
!force_sd_rebuild)
|
||||
rebuild_sched_domains_locked();
|
||||
if (need_rebuild_sched_domains)
|
||||
cpuset_force_rebuild();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -2141,9 +2161,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
|
||||
* directly.
|
||||
*
|
||||
* The update_cpumasks_hier() function may sleep. So we have to
|
||||
* release the RCU read lock before calling it. HIER_NO_SD_REBUILD
|
||||
* flag is used to suppress rebuild of sched domains as the callers
|
||||
* will take care of that.
|
||||
* release the RCU read lock before calling it.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
cpuset_for_each_child(sibling, pos_css, parent) {
|
||||
@ -2159,7 +2177,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
|
||||
continue;
|
||||
|
||||
rcu_read_unlock();
|
||||
update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
|
||||
update_cpumasks_hier(sibling, tmp, false);
|
||||
rcu_read_lock();
|
||||
css_put(&sibling->css);
|
||||
}
|
||||
@ -2179,7 +2197,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
struct tmpmasks tmp;
|
||||
struct cpuset *parent = parent_cs(cs);
|
||||
bool invalidate = false;
|
||||
int hier_flags = 0;
|
||||
bool force = false;
|
||||
int old_prs = cs->partition_root_state;
|
||||
|
||||
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
|
||||
@ -2206,7 +2224,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* When exclusive_cpus isn't explicitly set, it is constrainted
|
||||
* When exclusive_cpus isn't explicitly set, it is constrained
|
||||
* by cpus_allowed and parent's effective_xcpus. Otherwise,
|
||||
* trialcs->effective_xcpus is used as a temporary cpumask
|
||||
* for checking validity of the partition root.
|
||||
@ -2240,12 +2258,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
* Check all the descendants in update_cpumasks_hier() if
|
||||
* effective_xcpus is to be changed.
|
||||
*/
|
||||
if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
|
||||
hier_flags = HIER_CHECKALL;
|
||||
force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
|
||||
|
||||
retval = validate_change(cs, trialcs);
|
||||
|
||||
if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
|
||||
if ((retval == -EINVAL) && cpuset_v2()) {
|
||||
struct cgroup_subsys_state *css;
|
||||
struct cpuset *cp;
|
||||
|
||||
@ -2309,7 +2326,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
spin_unlock_irq(&callback_lock);
|
||||
|
||||
/* effective_cpus/effective_xcpus will be updated here */
|
||||
update_cpumasks_hier(cs, &tmp, hier_flags);
|
||||
update_cpumasks_hier(cs, &tmp, force);
|
||||
|
||||
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
|
||||
if (cs->partition_root_state)
|
||||
@ -2334,7 +2351,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
struct tmpmasks tmp;
|
||||
struct cpuset *parent = parent_cs(cs);
|
||||
bool invalidate = false;
|
||||
int hier_flags = 0;
|
||||
bool force = false;
|
||||
int old_prs = cs->partition_root_state;
|
||||
|
||||
if (!*buf) {
|
||||
@ -2357,8 +2374,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
* Check all the descendants in update_cpumasks_hier() if
|
||||
* effective_xcpus is to be changed.
|
||||
*/
|
||||
if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
|
||||
hier_flags = HIER_CHECKALL;
|
||||
force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
|
||||
|
||||
retval = validate_change(cs, trialcs);
|
||||
if (retval)
|
||||
@ -2411,8 +2427,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
* of the subtree when it is a valid partition root or effective_xcpus
|
||||
* is updated.
|
||||
*/
|
||||
if (is_partition_valid(cs) || hier_flags)
|
||||
update_cpumasks_hier(cs, &tmp, hier_flags);
|
||||
if (is_partition_valid(cs) || force)
|
||||
update_cpumasks_hier(cs, &tmp, force);
|
||||
|
||||
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
|
||||
if (cs->partition_root_state)
|
||||
@ -2737,9 +2753,12 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
|
||||
cs->flags = trialcs->flags;
|
||||
spin_unlock_irq(&callback_lock);
|
||||
|
||||
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed &&
|
||||
!force_sd_rebuild)
|
||||
rebuild_sched_domains_locked();
|
||||
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
|
||||
if (cpuset_v2())
|
||||
cpuset_force_rebuild();
|
||||
else
|
||||
rebuild_sched_domains_locked();
|
||||
}
|
||||
|
||||
if (spread_flag_changed)
|
||||
cpuset1_update_tasks_flags(cs);
|
||||
@ -2853,12 +2872,14 @@ out:
|
||||
update_unbound_workqueue_cpumask(new_xcpus_state);
|
||||
|
||||
/* Force update if switching back to member */
|
||||
update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
|
||||
update_cpumasks_hier(cs, &tmpmask, !new_prs);
|
||||
|
||||
/* Update sched domains and load balance flag */
|
||||
update_partition_sd_lb(cs, old_prs);
|
||||
|
||||
notify_partition_change(cs, old_prs);
|
||||
if (force_sd_rebuild)
|
||||
rebuild_sched_domains_locked();
|
||||
free_cpumasks(NULL, &tmpmask);
|
||||
return 0;
|
||||
}
|
||||
@ -2919,8 +2940,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
|
||||
* migration permission derives from hierarchy ownership in
|
||||
* cgroup_procs_write_permission()).
|
||||
*/
|
||||
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
|
||||
(cpus_updated || mems_updated)) {
|
||||
if (!cpuset_v2() || (cpus_updated || mems_updated)) {
|
||||
ret = security_task_setscheduler(task);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
@ -3034,8 +3054,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
|
||||
* in effective cpus and mems. In that case, we can optimize out
|
||||
* by skipping the task iteration and update.
|
||||
*/
|
||||
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
||||
!cpus_updated && !mems_updated) {
|
||||
if (cpuset_v2() && !cpus_updated && !mems_updated) {
|
||||
cpuset_attach_nodemask_to = cs->effective_mems;
|
||||
goto out;
|
||||
}
|
||||
@ -3152,6 +3171,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
|
||||
}
|
||||
|
||||
free_cpuset(trialcs);
|
||||
if (force_sd_rebuild)
|
||||
rebuild_sched_domains_locked();
|
||||
out_unlock:
|
||||
mutex_unlock(&cpuset_mutex);
|
||||
cpus_read_unlock();
|
||||
@ -3383,7 +3404,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
INIT_LIST_HEAD(&cs->remote_sibling);
|
||||
|
||||
/* Set CS_MEMORY_MIGRATE for default hierarchy */
|
||||
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
|
||||
if (cpuset_v2())
|
||||
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
|
||||
|
||||
return &cs->css;
|
||||
@ -3410,8 +3431,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
|
||||
/*
|
||||
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
|
||||
*/
|
||||
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
||||
!is_sched_load_balance(parent))
|
||||
if (cpuset_v2() && !is_sched_load_balance(parent))
|
||||
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
||||
|
||||
cpuset_inc();
|
||||
@ -3481,8 +3501,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
|
||||
if (is_partition_valid(cs))
|
||||
update_prstate(cs, 0);
|
||||
|
||||
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
|
||||
is_sched_load_balance(cs))
|
||||
if (!cpuset_v2() && is_sched_load_balance(cs))
|
||||
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
|
||||
|
||||
cpuset_dec();
|
||||
@ -3896,11 +3915,9 @@ static void cpuset_handle_hotplug(void)
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/* rebuild sched domains if cpus_allowed has changed */
|
||||
if (force_sd_rebuild) {
|
||||
force_sd_rebuild = false;
|
||||
/* rebuild sched domains if necessary */
|
||||
if (force_sd_rebuild)
|
||||
rebuild_sched_domains_cpuslocked();
|
||||
}
|
||||
|
||||
free_cpumasks(NULL, ptmp);
|
||||
}
|
||||
|
@ -8,6 +8,28 @@
|
||||
|
||||
#include <trace/events/cgroup.h>
|
||||
|
||||
/*
|
||||
* Update CGRP_FROZEN of cgroup.flag
|
||||
* Return true if flags is updated; false if flags has no change
|
||||
*/
|
||||
static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen)
|
||||
{
|
||||
lockdep_assert_held(&css_set_lock);
|
||||
|
||||
/* Already there? */
|
||||
if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen)
|
||||
return false;
|
||||
|
||||
if (frozen)
|
||||
set_bit(CGRP_FROZEN, &cgrp->flags);
|
||||
else
|
||||
clear_bit(CGRP_FROZEN, &cgrp->flags);
|
||||
|
||||
cgroup_file_notify(&cgrp->events_file);
|
||||
TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Propagate the cgroup frozen state upwards by the cgroup tree.
|
||||
*/
|
||||
@ -24,24 +46,16 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
|
||||
while ((cgrp = cgroup_parent(cgrp))) {
|
||||
if (frozen) {
|
||||
cgrp->freezer.nr_frozen_descendants += desc;
|
||||
if (!test_bit(CGRP_FROZEN, &cgrp->flags) &&
|
||||
test_bit(CGRP_FREEZE, &cgrp->flags) &&
|
||||
cgrp->freezer.nr_frozen_descendants ==
|
||||
cgrp->nr_descendants) {
|
||||
set_bit(CGRP_FROZEN, &cgrp->flags);
|
||||
cgroup_file_notify(&cgrp->events_file);
|
||||
TRACE_CGROUP_PATH(notify_frozen, cgrp, 1);
|
||||
desc++;
|
||||
}
|
||||
if (!test_bit(CGRP_FREEZE, &cgrp->flags) ||
|
||||
(cgrp->freezer.nr_frozen_descendants !=
|
||||
cgrp->nr_descendants))
|
||||
continue;
|
||||
} else {
|
||||
cgrp->freezer.nr_frozen_descendants -= desc;
|
||||
if (test_bit(CGRP_FROZEN, &cgrp->flags)) {
|
||||
clear_bit(CGRP_FROZEN, &cgrp->flags);
|
||||
cgroup_file_notify(&cgrp->events_file);
|
||||
TRACE_CGROUP_PATH(notify_frozen, cgrp, 0);
|
||||
desc++;
|
||||
}
|
||||
}
|
||||
|
||||
if (cgroup_update_frozen_flag(cgrp, frozen))
|
||||
desc++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -53,8 +67,6 @@ void cgroup_update_frozen(struct cgroup *cgrp)
|
||||
{
|
||||
bool frozen;
|
||||
|
||||
lockdep_assert_held(&css_set_lock);
|
||||
|
||||
/*
|
||||
* If the cgroup has to be frozen (CGRP_FREEZE bit set),
|
||||
* and all tasks are frozen and/or stopped, let's consider
|
||||
@ -63,24 +75,9 @@ void cgroup_update_frozen(struct cgroup *cgrp)
|
||||
frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
|
||||
cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
|
||||
|
||||
if (frozen) {
|
||||
/* Already there? */
|
||||
if (test_bit(CGRP_FROZEN, &cgrp->flags))
|
||||
return;
|
||||
|
||||
set_bit(CGRP_FROZEN, &cgrp->flags);
|
||||
} else {
|
||||
/* Already there? */
|
||||
if (!test_bit(CGRP_FROZEN, &cgrp->flags))
|
||||
return;
|
||||
|
||||
clear_bit(CGRP_FROZEN, &cgrp->flags);
|
||||
}
|
||||
cgroup_file_notify(&cgrp->events_file);
|
||||
TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
|
||||
|
||||
/* Update the state of ancestor cgroups. */
|
||||
cgroup_propagate_frozen(cgrp, frozen);
|
||||
/* If flags is updated, update the state of ancestor cgroups. */
|
||||
if (cgroup_update_frozen_flag(cgrp, frozen))
|
||||
cgroup_propagate_frozen(cgrp, frozen);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -260,8 +257,10 @@ void cgroup_freezer_migrate_task(struct task_struct *task,
|
||||
void cgroup_freeze(struct cgroup *cgrp, bool freeze)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct cgroup *parent;
|
||||
struct cgroup *dsct;
|
||||
bool applied = false;
|
||||
bool old_e;
|
||||
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
|
||||
@ -282,22 +281,18 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze)
|
||||
if (cgroup_is_dead(dsct))
|
||||
continue;
|
||||
|
||||
if (freeze) {
|
||||
dsct->freezer.e_freeze++;
|
||||
/*
|
||||
* Already frozen because of ancestor's settings?
|
||||
*/
|
||||
if (dsct->freezer.e_freeze > 1)
|
||||
continue;
|
||||
} else {
|
||||
dsct->freezer.e_freeze--;
|
||||
/*
|
||||
* Still frozen because of ancestor's settings?
|
||||
*/
|
||||
if (dsct->freezer.e_freeze > 0)
|
||||
continue;
|
||||
|
||||
WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
|
||||
/*
|
||||
* e_freeze is affected by parent's e_freeze and dsct's freeze.
|
||||
* If old e_freeze eq new e_freeze, no change, its children
|
||||
* will not be affected. So do nothing and skip the subtree
|
||||
*/
|
||||
old_e = dsct->freezer.e_freeze;
|
||||
parent = cgroup_parent(dsct);
|
||||
dsct->freezer.e_freeze = (dsct->freezer.freeze ||
|
||||
parent->freezer.e_freeze);
|
||||
if (dsct->freezer.e_freeze == old_e) {
|
||||
css = css_rightmost_descendant(css);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -444,6 +444,7 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
|
||||
#ifdef CONFIG_SCHED_CORE
|
||||
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
|
||||
#endif
|
||||
dst_bstat->ntime += src_bstat->ntime;
|
||||
}
|
||||
|
||||
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
|
||||
@ -455,6 +456,7 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
|
||||
#ifdef CONFIG_SCHED_CORE
|
||||
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
|
||||
#endif
|
||||
dst_bstat->ntime -= src_bstat->ntime;
|
||||
}
|
||||
|
||||
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
|
||||
@ -534,8 +536,10 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
|
||||
rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
|
||||
|
||||
switch (index) {
|
||||
case CPUTIME_USER:
|
||||
case CPUTIME_NICE:
|
||||
rstatc->bstat.ntime += delta_exec;
|
||||
fallthrough;
|
||||
case CPUTIME_USER:
|
||||
rstatc->bstat.cputime.utime += delta_exec;
|
||||
break;
|
||||
case CPUTIME_SYSTEM:
|
||||
@ -591,6 +595,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
|
||||
#ifdef CONFIG_SCHED_CORE
|
||||
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
|
||||
#endif
|
||||
bstat->ntime += cpustat[CPUTIME_NICE];
|
||||
}
|
||||
}
|
||||
|
||||
@ -608,13 +613,14 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
|
||||
void cgroup_base_stat_cputime_show(struct seq_file *seq)
|
||||
{
|
||||
struct cgroup *cgrp = seq_css(seq)->cgroup;
|
||||
u64 usage, utime, stime;
|
||||
u64 usage, utime, stime, ntime;
|
||||
|
||||
if (cgroup_parent(cgrp)) {
|
||||
cgroup_rstat_flush_hold(cgrp);
|
||||
usage = cgrp->bstat.cputime.sum_exec_runtime;
|
||||
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
|
||||
&utime, &stime);
|
||||
ntime = cgrp->bstat.ntime;
|
||||
cgroup_rstat_flush_release(cgrp);
|
||||
} else {
|
||||
/* cgrp->bstat of root is not actually used, reuse it */
|
||||
@ -622,16 +628,19 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
|
||||
usage = cgrp->bstat.cputime.sum_exec_runtime;
|
||||
utime = cgrp->bstat.cputime.utime;
|
||||
stime = cgrp->bstat.cputime.stime;
|
||||
ntime = cgrp->bstat.ntime;
|
||||
}
|
||||
|
||||
do_div(usage, NSEC_PER_USEC);
|
||||
do_div(utime, NSEC_PER_USEC);
|
||||
do_div(stime, NSEC_PER_USEC);
|
||||
do_div(ntime, NSEC_PER_USEC);
|
||||
|
||||
seq_printf(seq, "usage_usec %llu\n"
|
||||
"user_usec %llu\n"
|
||||
"system_usec %llu\n",
|
||||
usage, utime, stime);
|
||||
"user_usec %llu\n"
|
||||
"system_usec %llu\n"
|
||||
"nice_usec %llu\n",
|
||||
usage, utime, stime, ntime);
|
||||
|
||||
cgroup_force_idle_show(seq, &cgrp->bstat);
|
||||
}
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "../kselftest.h"
|
||||
#include "cgroup_util.h"
|
||||
@ -229,6 +230,79 @@ cleanup:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Creates a nice process that consumes CPU and checks that the elapsed
|
||||
* usertime in the cgroup is close to the expected time.
|
||||
*/
|
||||
static int test_cpucg_nice(const char *root)
|
||||
{
|
||||
int ret = KSFT_FAIL;
|
||||
int status;
|
||||
long user_usec, nice_usec;
|
||||
long usage_seconds = 2;
|
||||
long expected_nice_usec = usage_seconds * USEC_PER_SEC;
|
||||
char *cpucg;
|
||||
pid_t pid;
|
||||
|
||||
cpucg = cg_name(root, "cpucg_test");
|
||||
if (!cpucg)
|
||||
goto cleanup;
|
||||
|
||||
if (cg_create(cpucg))
|
||||
goto cleanup;
|
||||
|
||||
user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
|
||||
nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
|
||||
if (nice_usec == -1)
|
||||
ret = KSFT_SKIP;
|
||||
if (user_usec != 0 || nice_usec != 0)
|
||||
goto cleanup;
|
||||
|
||||
/*
|
||||
* We fork here to create a new process that can be niced without
|
||||
* polluting the nice value of other selftests
|
||||
*/
|
||||
pid = fork();
|
||||
if (pid < 0) {
|
||||
goto cleanup;
|
||||
} else if (pid == 0) {
|
||||
struct cpu_hog_func_param param = {
|
||||
.nprocs = 1,
|
||||
.ts = {
|
||||
.tv_sec = usage_seconds,
|
||||
.tv_nsec = 0,
|
||||
},
|
||||
.clock_type = CPU_HOG_CLOCK_PROCESS,
|
||||
};
|
||||
char buf[64];
|
||||
snprintf(buf, sizeof(buf), "%d", getpid());
|
||||
if (cg_write(cpucg, "cgroup.procs", buf))
|
||||
goto cleanup;
|
||||
|
||||
/* Try to keep niced CPU usage as constrained to hog_cpu as possible */
|
||||
nice(1);
|
||||
hog_cpus_timed(cpucg, ¶m);
|
||||
exit(0);
|
||||
} else {
|
||||
waitpid(pid, &status, 0);
|
||||
if (!WIFEXITED(status))
|
||||
goto cleanup;
|
||||
|
||||
user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
|
||||
nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
|
||||
if (!values_close(nice_usec, expected_nice_usec, 1))
|
||||
goto cleanup;
|
||||
|
||||
ret = KSFT_PASS;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
cg_destroy(cpucg);
|
||||
free(cpucg);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int
|
||||
run_cpucg_weight_test(
|
||||
const char *root,
|
||||
@ -686,6 +760,7 @@ struct cpucg_test {
|
||||
} tests[] = {
|
||||
T(test_cpucg_subtree_control),
|
||||
T(test_cpucg_stats),
|
||||
T(test_cpucg_nice),
|
||||
T(test_cpucg_weight_overprovisioned),
|
||||
T(test_cpucg_weight_underprovisioned),
|
||||
T(test_cpucg_nested_weight_overprovisioned),
|
||||
|
Loading…
Reference in New Issue
Block a user