Merge branch 'for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - cgroup v2 interface is now official. It's no longer hidden behind a devel flag and can be mounted using the new cgroup2 fs type. Unfortunately, cpu v2 interface hasn't made it yet due to the discussion around in-process hierarchical resource distribution and only memory and io controllers can be used on the v2 interface at the moment. - The existing documentation which has always been a bit of mess is relocated under Documentation/cgroup-v1/. Documentation/cgroup-v2.txt is added as the authoritative documentation for the v2 interface. - Some features are added through for-4.5-ancestor-test branch to enable netfilter xt_cgroup match to use cgroup v2 paths. The actual netfilter changes will be merged through the net tree which pulled in the said branch. - Various cleanups * 'for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: rename cgroup documentations cgroup: fix a typo. cgroup: Remove resource_counter.txt in Documentation/cgroup-legacy/00-INDEX. cgroup: demote subsystem init messages to KERN_DEBUG cgroup: Fix uninitialized variable warning cgroup: put controller Kconfig options in meaningful order cgroup: clean up the kernel configuration menu nomenclature cgroup_pids: fix a typo. Subject: cgroup: Fix incomplete dd command in blkio documentation cgroup: kill cgrp_ss_priv[CGROUP_CANFORK_COUNT] and friends cpuset: Replace all instances of time_t with time64_t cgroup: replace unified-hierarchy.txt with a proper cgroup v2 documentation cgroup: rename Documentation/cgroups/ to Documentation/cgroup-legacy/ cgroup: replace __DEVEL__sane_behavior with cgroup2 fs type
This commit is contained in:
@@ -24,7 +24,5 @@ net_prio.txt
|
|||||||
- Network priority cgroups details and usages.
|
- Network priority cgroups details and usages.
|
||||||
pids.txt
|
pids.txt
|
||||||
- Process number cgroups details and usages.
|
- Process number cgroups details and usages.
|
||||||
resource_counter.txt
|
|
||||||
- Resource Counter API.
|
|
||||||
unified-hierarchy.txt
|
unified-hierarchy.txt
|
||||||
- Describes the new/next cgroup interface.
|
- Describes the new/next cgroup interface.
|
||||||
@@ -84,8 +84,7 @@ Throttling/Upper Limit policy
|
|||||||
|
|
||||||
- Run dd to read a file and see if rate is throttled to 1MB/s or not.
|
- Run dd to read a file and see if rate is throttled to 1MB/s or not.
|
||||||
|
|
||||||
# dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
|
# dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
|
||||||
# iflag=direct
|
|
||||||
1024+0 records in
|
1024+0 records in
|
||||||
1024+0 records out
|
1024+0 records out
|
||||||
4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
|
4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
|
||||||
@@ -374,82 +373,3 @@ One can experience an overall throughput drop if you have created multiple
|
|||||||
groups and put applications in that group which are not driving enough
|
groups and put applications in that group which are not driving enough
|
||||||
IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
|
IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
|
||||||
on individual groups and throughput should improve.
|
on individual groups and throughput should improve.
|
||||||
|
|
||||||
Writeback
|
|
||||||
=========
|
|
||||||
|
|
||||||
Page cache is dirtied through buffered writes and shared mmaps and
|
|
||||||
written asynchronously to the backing filesystem by the writeback
|
|
||||||
mechanism. Writeback sits between the memory and IO domains and
|
|
||||||
regulates the proportion of dirty memory by balancing dirtying and
|
|
||||||
write IOs.
|
|
||||||
|
|
||||||
On traditional cgroup hierarchies, relationships between different
|
|
||||||
controllers cannot be established making it impossible for writeback
|
|
||||||
to operate accounting for cgroup resource restrictions and all
|
|
||||||
writeback IOs are attributed to the root cgroup.
|
|
||||||
|
|
||||||
If both the blkio and memory controllers are used on the v2 hierarchy
|
|
||||||
and the filesystem supports cgroup writeback, writeback operations
|
|
||||||
correctly follow the resource restrictions imposed by both memory and
|
|
||||||
blkio controllers.
|
|
||||||
|
|
||||||
Writeback examines both system-wide and per-cgroup dirty memory status
|
|
||||||
and enforces the more restrictive of the two. Also, writeback control
|
|
||||||
parameters which are absolute values - vm.dirty_bytes and
|
|
||||||
vm.dirty_background_bytes - are distributed across cgroups according
|
|
||||||
to their current writeback bandwidth.
|
|
||||||
|
|
||||||
There's a peculiarity stemming from the discrepancy in ownership
|
|
||||||
granularity between memory controller and writeback. While memory
|
|
||||||
controller tracks ownership per page, writeback operates on inode
|
|
||||||
basis. cgroup writeback bridges the gap by tracking ownership by
|
|
||||||
inode but migrating ownership if too many foreign pages, pages which
|
|
||||||
don't match the current inode ownership, have been encountered while
|
|
||||||
writing back the inode.
|
|
||||||
|
|
||||||
This is a conscious design choice as writeback operations are
|
|
||||||
inherently tied to inodes making strictly following page ownership
|
|
||||||
complicated and inefficient. The only use case which suffers from
|
|
||||||
this compromise is multiple cgroups concurrently dirtying disjoint
|
|
||||||
regions of the same inode, which is an unlikely use case and decided
|
|
||||||
to be unsupported. Note that as memory controller assigns page
|
|
||||||
ownership on the first use and doesn't update it until the page is
|
|
||||||
released, even if cgroup writeback strictly follows page ownership,
|
|
||||||
multiple cgroups dirtying overlapping areas wouldn't work as expected.
|
|
||||||
In general, write-sharing an inode across multiple cgroups is not well
|
|
||||||
supported.
|
|
||||||
|
|
||||||
Filesystem support for cgroup writeback
|
|
||||||
---------------------------------------
|
|
||||||
|
|
||||||
A filesystem can make writeback IOs cgroup-aware by updating
|
|
||||||
address_space_operations->writepage[s]() to annotate bio's using the
|
|
||||||
following two functions.
|
|
||||||
|
|
||||||
* wbc_init_bio(@wbc, @bio)
|
|
||||||
|
|
||||||
Should be called for each bio carrying writeback data and associates
|
|
||||||
the bio with the inode's owner cgroup. Can be called anytime
|
|
||||||
between bio allocation and submission.
|
|
||||||
|
|
||||||
* wbc_account_io(@wbc, @page, @bytes)
|
|
||||||
|
|
||||||
Should be called for each data segment being written out. While
|
|
||||||
this function doesn't care exactly when it's called during the
|
|
||||||
writeback session, it's the easiest and most natural to call it as
|
|
||||||
data segments are added to a bio.
|
|
||||||
|
|
||||||
With writeback bio's annotated, cgroup support can be enabled per
|
|
||||||
super_block by setting MS_CGROUPWB in ->s_flags. This allows for
|
|
||||||
selective disabling of cgroup writeback support which is helpful when
|
|
||||||
certain filesystem features, e.g. journaled data mode, are
|
|
||||||
incompatible.
|
|
||||||
|
|
||||||
wbc_init_bio() binds the specified bio to its cgroup. Depending on
|
|
||||||
the configuration, the bio may be executed at a lower priority and if
|
|
||||||
the writeback session is holding shared resources, e.g. a journal
|
|
||||||
entry, may lead to priority inversion. There is no one easy solution
|
|
||||||
for the problem. Filesystems can try to work around specific problem
|
|
||||||
cases by skipping wbc_init_bio() or using bio_associate_blkcg()
|
|
||||||
directly.
|
|
||||||
1293
Documentation/cgroup-v2.txt
Normal file
1293
Documentation/cgroup-v2.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,647 +0,0 @@
|
|||||||
|
|
||||||
Cgroup unified hierarchy
|
|
||||||
|
|
||||||
April, 2014 Tejun Heo <tj@kernel.org>
|
|
||||||
|
|
||||||
This document describes the changes made by unified hierarchy and
|
|
||||||
their rationales. It will eventually be merged into the main cgroup
|
|
||||||
documentation.
|
|
||||||
|
|
||||||
CONTENTS
|
|
||||||
|
|
||||||
1. Background
|
|
||||||
2. Basic Operation
|
|
||||||
2-1. Mounting
|
|
||||||
2-2. cgroup.subtree_control
|
|
||||||
2-3. cgroup.controllers
|
|
||||||
3. Structural Constraints
|
|
||||||
3-1. Top-down
|
|
||||||
3-2. No internal tasks
|
|
||||||
4. Delegation
|
|
||||||
4-1. Model of delegation
|
|
||||||
4-2. Common ancestor rule
|
|
||||||
5. Other Changes
|
|
||||||
5-1. [Un]populated Notification
|
|
||||||
5-2. Other Core Changes
|
|
||||||
5-3. Controller File Conventions
|
|
||||||
5-3-1. Format
|
|
||||||
5-3-2. Control Knobs
|
|
||||||
5-4. Per-Controller Changes
|
|
||||||
5-4-1. io
|
|
||||||
5-4-2. cpuset
|
|
||||||
5-4-3. memory
|
|
||||||
6. Planned Changes
|
|
||||||
6-1. CAP for resource control
|
|
||||||
|
|
||||||
|
|
||||||
1. Background
|
|
||||||
|
|
||||||
cgroup allows an arbitrary number of hierarchies and each hierarchy
|
|
||||||
can host any number of controllers. While this seems to provide a
|
|
||||||
high level of flexibility, it isn't quite useful in practice.
|
|
||||||
|
|
||||||
For example, as there is only one instance of each controller, utility
|
|
||||||
type controllers such as freezer which can be useful in all
|
|
||||||
hierarchies can only be used in one. The issue is exacerbated by the
|
|
||||||
fact that controllers can't be moved around once hierarchies are
|
|
||||||
populated. Another issue is that all controllers bound to a hierarchy
|
|
||||||
are forced to have exactly the same view of the hierarchy. It isn't
|
|
||||||
possible to vary the granularity depending on the specific controller.
|
|
||||||
|
|
||||||
In practice, these issues heavily limit which controllers can be put
|
|
||||||
on the same hierarchy and most configurations resort to putting each
|
|
||||||
controller on its own hierarchy. Only closely related ones, such as
|
|
||||||
the cpu and cpuacct controllers, make sense to put on the same
|
|
||||||
hierarchy. This often means that userland ends up managing multiple
|
|
||||||
similar hierarchies repeating the same steps on each hierarchy
|
|
||||||
whenever a hierarchy management operation is necessary.
|
|
||||||
|
|
||||||
Unfortunately, support for multiple hierarchies comes at a steep cost.
|
|
||||||
Internal implementation in cgroup core proper is dazzlingly
|
|
||||||
complicated but more importantly the support for multiple hierarchies
|
|
||||||
restricts how cgroup is used in general and what controllers can do.
|
|
||||||
|
|
||||||
There's no limit on how many hierarchies there may be, which means
|
|
||||||
that a task's cgroup membership can't be described in finite length.
|
|
||||||
The key may contain any varying number of entries and is unlimited in
|
|
||||||
length, which makes it highly awkward to handle and leads to addition
|
|
||||||
of controllers which exist only to identify membership, which in turn
|
|
||||||
exacerbates the original problem.
|
|
||||||
|
|
||||||
Also, as a controller can't have any expectation regarding what shape
|
|
||||||
of hierarchies other controllers would be on, each controller has to
|
|
||||||
assume that all other controllers are operating on completely
|
|
||||||
orthogonal hierarchies. This makes it impossible, or at least very
|
|
||||||
cumbersome, for controllers to cooperate with each other.
|
|
||||||
|
|
||||||
In most use cases, putting controllers on hierarchies which are
|
|
||||||
completely orthogonal to each other isn't necessary. What usually is
|
|
||||||
called for is the ability to have differing levels of granularity
|
|
||||||
depending on the specific controller. In other words, hierarchy may
|
|
||||||
be collapsed from leaf towards root when viewed from specific
|
|
||||||
controllers. For example, a given configuration might not care about
|
|
||||||
how memory is distributed beyond a certain level while still wanting
|
|
||||||
to control how CPU cycles are distributed.
|
|
||||||
|
|
||||||
Unified hierarchy is the next version of cgroup interface. It aims to
|
|
||||||
address the aforementioned issues by having more structure while
|
|
||||||
retaining enough flexibility for most use cases. Various other
|
|
||||||
general and controller-specific interface issues are also addressed in
|
|
||||||
the process.
|
|
||||||
|
|
||||||
|
|
||||||
2. Basic Operation
|
|
||||||
|
|
||||||
2-1. Mounting
|
|
||||||
|
|
||||||
Currently, unified hierarchy can be mounted with the following mount
|
|
||||||
command. Note that this is still under development and scheduled to
|
|
||||||
change soon.
|
|
||||||
|
|
||||||
mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
|
|
||||||
|
|
||||||
All controllers which support the unified hierarchy and are not bound
|
|
||||||
to other hierarchies are automatically bound to unified hierarchy and
|
|
||||||
show up at the root of it. Controllers which are enabled only in the
|
|
||||||
root of unified hierarchy can be bound to other hierarchies. This
|
|
||||||
allows mixing unified hierarchy with the traditional multiple
|
|
||||||
hierarchies in a fully backward compatible way.
|
|
||||||
|
|
||||||
A controller can be moved across hierarchies only after the controller
|
|
||||||
is no longer referenced in its current hierarchy. Because per-cgroup
|
|
||||||
controller states are destroyed asynchronously and controllers may
|
|
||||||
have lingering references, a controller may not show up immediately on
|
|
||||||
the unified hierarchy after the final umount of the previous
|
|
||||||
hierarchy. Similarly, a controller should be fully disabled to be
|
|
||||||
moved out of the unified hierarchy and it may take some time for the
|
|
||||||
disabled controller to become available for other hierarchies;
|
|
||||||
furthermore, due to dependencies among controllers, other controllers
|
|
||||||
may need to be disabled too.
|
|
||||||
|
|
||||||
While useful for development and manual configurations, dynamically
|
|
||||||
moving controllers between the unified and other hierarchies is
|
|
||||||
strongly discouraged for production use. It is recommended to decide
|
|
||||||
the hierarchies and controller associations before starting using the
|
|
||||||
controllers.
|
|
||||||
|
|
||||||
|
|
||||||
2-2. cgroup.subtree_control
|
|
||||||
|
|
||||||
All cgroups on unified hierarchy have a "cgroup.subtree_control" file
|
|
||||||
which governs which controllers are enabled on the children of the
|
|
||||||
cgroup. Let's assume a hierarchy like the following.
|
|
||||||
|
|
||||||
root - A - B - C
|
|
||||||
\ D
|
|
||||||
|
|
||||||
root's "cgroup.subtree_control" file determines which controllers are
|
|
||||||
enabled on A. A's on B. B's on C and D. This coincides with the
|
|
||||||
fact that controllers on the immediate sub-level are used to
|
|
||||||
distribute the resources of the parent. In fact, it's natural to
|
|
||||||
assume that resource control knobs of a child belong to its parent.
|
|
||||||
Enabling a controller in a "cgroup.subtree_control" file declares that
|
|
||||||
distribution of the respective resources of the cgroup will be
|
|
||||||
controlled. Note that this means that controller enable states are
|
|
||||||
shared among siblings.
|
|
||||||
|
|
||||||
When read, the file contains a space-separated list of currently
|
|
||||||
enabled controllers. A write to the file should contain a
|
|
||||||
space-separated list of controllers with '+' or '-' prefixed (without
|
|
||||||
the quotes). Controllers prefixed with '+' are enabled and '-'
|
|
||||||
disabled. If a controller is listed multiple times, the last entry
|
|
||||||
wins. The specific operations are executed atomically - either all
|
|
||||||
succeed or fail.
|
|
||||||
|
|
||||||
|
|
||||||
2-3. cgroup.controllers
|
|
||||||
|
|
||||||
Read-only "cgroup.controllers" file contains a space-separated list of
|
|
||||||
controllers which can be enabled in the cgroup's
|
|
||||||
"cgroup.subtree_control" file.
|
|
||||||
|
|
||||||
In the root cgroup, this lists controllers which are not bound to
|
|
||||||
other hierarchies and the content changes as controllers are bound to
|
|
||||||
and unbound from other hierarchies.
|
|
||||||
|
|
||||||
In non-root cgroups, the content of this file equals that of the
|
|
||||||
parent's "cgroup.subtree_control" file as only controllers enabled
|
|
||||||
from the parent can be used in its children.
|
|
||||||
|
|
||||||
|
|
||||||
3. Structural Constraints
|
|
||||||
|
|
||||||
3-1. Top-down
|
|
||||||
|
|
||||||
As it doesn't make sense to nest control of an uncontrolled resource,
|
|
||||||
all non-root "cgroup.subtree_control" files can only contain
|
|
||||||
controllers which are enabled in the parent's "cgroup.subtree_control"
|
|
||||||
file. A controller can be enabled only if the parent has the
|
|
||||||
controller enabled and a controller can't be disabled if one or more
|
|
||||||
children have it enabled.
|
|
||||||
|
|
||||||
|
|
||||||
3-2. No internal tasks
|
|
||||||
|
|
||||||
One long-standing issue that cgroup faces is the competition between
|
|
||||||
tasks belonging to the parent cgroup and its children cgroups. This
|
|
||||||
is inherently nasty as two different types of entities compete and
|
|
||||||
there is no agreed-upon obvious way to handle it. Different
|
|
||||||
controllers are doing different things.
|
|
||||||
|
|
||||||
The cpu controller considers tasks and cgroups as equivalents and maps
|
|
||||||
nice levels to cgroup weights. This works for some cases but falls
|
|
||||||
flat when children should be allocated specific ratios of CPU cycles
|
|
||||||
and the number of internal tasks fluctuates - the ratios constantly
|
|
||||||
change as the number of competing entities fluctuates. There also are
|
|
||||||
other issues. The mapping from nice level to weight isn't obvious or
|
|
||||||
universal, and there are various other knobs which simply aren't
|
|
||||||
available for tasks.
|
|
||||||
|
|
||||||
The io controller implicitly creates a hidden leaf node for each
|
|
||||||
cgroup to host the tasks. The hidden leaf has its own copies of all
|
|
||||||
the knobs with "leaf_" prefixed. While this allows equivalent control
|
|
||||||
over internal tasks, it's with serious drawbacks. It always adds an
|
|
||||||
extra layer of nesting which may not be necessary, makes the interface
|
|
||||||
messy and significantly complicates the implementation.
|
|
||||||
|
|
||||||
The memory controller currently doesn't have a way to control what
|
|
||||||
happens between internal tasks and child cgroups and the behavior is
|
|
||||||
not clearly defined. There have been attempts to add ad-hoc behaviors
|
|
||||||
and knobs to tailor the behavior to specific workloads. Continuing
|
|
||||||
this direction will lead to problems which will be extremely difficult
|
|
||||||
to resolve in the long term.
|
|
||||||
|
|
||||||
Multiple controllers struggle with internal tasks and came up with
|
|
||||||
different ways to deal with it; unfortunately, all the approaches in
|
|
||||||
use now are severely flawed and, furthermore, the widely different
|
|
||||||
behaviors make cgroup as whole highly inconsistent.
|
|
||||||
|
|
||||||
It is clear that this is something which needs to be addressed from
|
|
||||||
cgroup core proper in a uniform way so that controllers don't need to
|
|
||||||
worry about it and cgroup as a whole shows a consistent and logical
|
|
||||||
behavior. To achieve that, unified hierarchy enforces the following
|
|
||||||
structural constraint:
|
|
||||||
|
|
||||||
Except for the root, only cgroups which don't contain any task may
|
|
||||||
have controllers enabled in their "cgroup.subtree_control" files.
|
|
||||||
|
|
||||||
Combined with other properties, this guarantees that, when a
|
|
||||||
controller is looking at the part of the hierarchy which has it
|
|
||||||
enabled, tasks are always only on the leaves. This rules out
|
|
||||||
situations where child cgroups compete against internal tasks of the
|
|
||||||
parent.
|
|
||||||
|
|
||||||
There are two things to note. Firstly, the root cgroup is exempt from
|
|
||||||
the restriction. Root contains tasks and anonymous resource
|
|
||||||
consumption which can't be associated with any other cgroup and
|
|
||||||
requires special treatment from most controllers. How resource
|
|
||||||
consumption in the root cgroup is governed is up to each controller.
|
|
||||||
|
|
||||||
Secondly, the restriction doesn't take effect if there is no enabled
|
|
||||||
controller in the cgroup's "cgroup.subtree_control" file. This is
|
|
||||||
important as otherwise it wouldn't be possible to create children of a
|
|
||||||
populated cgroup. To control resource distribution of a cgroup, the
|
|
||||||
cgroup must create children and transfer all its tasks to the children
|
|
||||||
before enabling controllers in its "cgroup.subtree_control" file.
|
|
||||||
|
|
||||||
|
|
||||||
4. Delegation
|
|
||||||
|
|
||||||
4-1. Model of delegation
|
|
||||||
|
|
||||||
A cgroup can be delegated to a less privileged user by granting write
|
|
||||||
access of the directory and its "cgroup.procs" file to the user. Note
|
|
||||||
that the resource control knobs in a given directory concern the
|
|
||||||
resources of the parent and thus must not be delegated along with the
|
|
||||||
directory.
|
|
||||||
|
|
||||||
Once delegated, the user can build sub-hierarchy under the directory,
|
|
||||||
organize processes as it sees fit and further distribute the resources
|
|
||||||
it got from the parent. The limits and other settings of all resource
|
|
||||||
controllers are hierarchical and regardless of what happens in the
|
|
||||||
delegated sub-hierarchy, nothing can escape the resource restrictions
|
|
||||||
imposed by the parent.
|
|
||||||
|
|
||||||
Currently, cgroup doesn't impose any restrictions on the number of
|
|
||||||
cgroups in or nesting depth of a delegated sub-hierarchy; however,
|
|
||||||
this may in the future be limited explicitly.
|
|
||||||
|
|
||||||
|
|
||||||
4-2. Common ancestor rule
|
|
||||||
|
|
||||||
On the unified hierarchy, to write to a "cgroup.procs" file, in
|
|
||||||
addition to the usual write permission to the file and uid match, the
|
|
||||||
writer must also have write access to the "cgroup.procs" file of the
|
|
||||||
common ancestor of the source and destination cgroups. This prevents
|
|
||||||
delegatees from smuggling processes across disjoint sub-hierarchies.
|
|
||||||
|
|
||||||
Let's say cgroups C0 and C1 have been delegated to user U0 who created
|
|
||||||
C00, C01 under C0 and C10 under C1 as follows.
|
|
||||||
|
|
||||||
~~~~~~~~~~~~~ - C0 - C00
|
|
||||||
~ cgroup ~ \ C01
|
|
||||||
~ hierarchy ~
|
|
||||||
~~~~~~~~~~~~~ - C1 - C10
|
|
||||||
|
|
||||||
C0 and C1 are separate entities in terms of resource distribution
|
|
||||||
regardless of their relative positions in the hierarchy. The
|
|
||||||
resources the processes under C0 are entitled to are controlled by
|
|
||||||
C0's ancestors and may be completely different from C1. It's clear
|
|
||||||
that the intention of delegating C0 to U0 is allowing U0 to organize
|
|
||||||
the processes under C0 and further control the distribution of C0's
|
|
||||||
resources.
|
|
||||||
|
|
||||||
On traditional hierarchies, if a task has write access to "tasks" or
|
|
||||||
"cgroup.procs" file of a cgroup and its uid agrees with the target, it
|
|
||||||
can move the target to the cgroup. In the above example, U0 will not
|
|
||||||
only be able to move processes in each sub-hierarchy but also across
|
|
||||||
the two sub-hierarchies, effectively allowing it to violate the
|
|
||||||
organizational and resource restrictions implied by the hierarchical
|
|
||||||
structure above C0 and C1.
|
|
||||||
|
|
||||||
On the unified hierarchy, let's say U0 wants to write the pid of a
|
|
||||||
process which has a matching uid and is currently in C10 into
|
|
||||||
"C00/cgroup.procs". U0 obviously has write access to the file and
|
|
||||||
migration permission on the process; however, the common ancestor of
|
|
||||||
the source cgroup C10 and the destination cgroup C00 is above the
|
|
||||||
points of delegation and U0 would not have write access to its
|
|
||||||
"cgroup.procs" and thus be denied with -EACCES.
|
|
||||||
|
|
||||||
|
|
||||||
5. Other Changes
|
|
||||||
|
|
||||||
5-1. [Un]populated Notification
|
|
||||||
|
|
||||||
cgroup users often need a way to determine when a cgroup's
|
|
||||||
subhierarchy becomes empty so that it can be cleaned up. cgroup
|
|
||||||
currently provides release_agent for it; unfortunately, this mechanism
|
|
||||||
is riddled with issues.
|
|
||||||
|
|
||||||
- It delivers events by forking and execing a userland binary
|
|
||||||
specified as the release_agent. This is a long deprecated method of
|
|
||||||
notification delivery. It's extremely heavy, slow and cumbersome to
|
|
||||||
integrate with larger infrastructure.
|
|
||||||
|
|
||||||
- There is single monitoring point at the root. There's no way to
|
|
||||||
delegate management of a subtree.
|
|
||||||
|
|
||||||
- The event isn't recursive. It triggers when a cgroup doesn't have
|
|
||||||
any tasks or child cgroups. Events for internal nodes trigger only
|
|
||||||
after all children are removed. This again makes it impossible to
|
|
||||||
delegate management of a subtree.
|
|
||||||
|
|
||||||
- Events are filtered from the kernel side. A "notify_on_release"
|
|
||||||
file is used to subscribe to or suppress release events. This is
|
|
||||||
unnecessarily complicated and probably done this way because event
|
|
||||||
delivery itself was expensive.
|
|
||||||
|
|
||||||
Unified hierarchy implements "populated" field in "cgroup.events"
|
|
||||||
interface file which can be used to monitor whether the cgroup's
|
|
||||||
subhierarchy has tasks in it or not. Its value is 0 if there is no
|
|
||||||
task in the cgroup and its descendants; otherwise, 1. poll and
|
|
||||||
[id]notify events are triggered when the value changes.
|
|
||||||
|
|
||||||
This is significantly lighter and simpler and trivially allows
|
|
||||||
delegating management of subhierarchy - subhierarchy monitoring can
|
|
||||||
block further propagation simply by putting itself or another process
|
|
||||||
in the subhierarchy and monitor events that it's interested in from
|
|
||||||
there without interfering with monitoring higher in the tree.
|
|
||||||
|
|
||||||
In unified hierarchy, the release_agent mechanism is no longer
|
|
||||||
supported and the interface files "release_agent" and
|
|
||||||
"notify_on_release" do not exist.
|
|
||||||
|
|
||||||
|
|
||||||
5-2. Other Core Changes
|
|
||||||
|
|
||||||
- None of the mount options is allowed.
|
|
||||||
|
|
||||||
- remount is disallowed.
|
|
||||||
|
|
||||||
- rename(2) is disallowed.
|
|
||||||
|
|
||||||
- The "tasks" file is removed. Everything should be at process
|
|
||||||
granularity. Use the "cgroup.procs" file instead.
|
|
||||||
|
|
||||||
- The "cgroup.procs" file is not sorted. pids will be unique unless
|
|
||||||
they got recycled in-between reads.
|
|
||||||
|
|
||||||
- The "cgroup.clone_children" file is removed.
|
|
||||||
|
|
||||||
- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
|
|
||||||
to before exiting. If the cgroup is removed before the zombie is
|
|
||||||
reaped, " (deleted)" is appended to the path.
|
|
||||||
|
|
||||||
|
|
||||||
5-3. Controller File Conventions
|
|
||||||
|
|
||||||
5-3-1. Format
|
|
||||||
|
|
||||||
In general, all controller files should be in one of the following
|
|
||||||
formats whenever possible.
|
|
||||||
|
|
||||||
- Values only files
|
|
||||||
|
|
||||||
VAL0 VAL1...\n
|
|
||||||
|
|
||||||
- Flat keyed files
|
|
||||||
|
|
||||||
KEY0 VAL0\n
|
|
||||||
KEY1 VAL1\n
|
|
||||||
...
|
|
||||||
|
|
||||||
- Nested keyed files
|
|
||||||
|
|
||||||
KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
|
|
||||||
KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
|
|
||||||
...
|
|
||||||
|
|
||||||
For a writeable file, the format for writing should generally match
|
|
||||||
reading; however, controllers may allow omitting later fields or
|
|
||||||
implement restricted shortcuts for most common use cases.
|
|
||||||
|
|
||||||
For both flat and nested keyed files, only the values for a single key
|
|
||||||
can be written at a time. For nested keyed files, the sub key pairs
|
|
||||||
may be specified in any order and not all pairs have to be specified.
|
|
||||||
|
|
||||||
|
|
||||||
5-3-2. Control Knobs
|
|
||||||
|
|
||||||
- Settings for a single feature should generally be implemented in a
|
|
||||||
single file.
|
|
||||||
|
|
||||||
- In general, the root cgroup should be exempt from resource control
|
|
||||||
and thus shouldn't have resource control knobs.
|
|
||||||
|
|
||||||
- If a controller implements ratio based resource distribution, the
|
|
||||||
control knob should be named "weight" and have the range [1, 10000]
|
|
||||||
and 100 should be the default value. The values are chosen to allow
|
|
||||||
enough and symmetric bias in both directions while keeping it
|
|
||||||
intuitive (the default is 100%).
|
|
||||||
|
|
||||||
- If a controller implements an absolute resource guarantee and/or
|
|
||||||
limit, the control knobs should be named "min" and "max"
|
|
||||||
respectively. If a controller implements best effort resource
|
|
||||||
guarantee and/or limit, the control knobs should be named "low" and
|
|
||||||
"high" respectively.
|
|
||||||
|
|
||||||
In the above four control files, the special token "max" should be
|
|
||||||
used to represent upward infinity for both reading and writing.
|
|
||||||
|
|
||||||
- If a setting has configurable default value and specific overrides,
|
|
||||||
the default settings should be keyed with "default" and appear as
|
|
||||||
the first entry in the file. Specific entries can use "default" as
|
|
||||||
its value to indicate inheritance of the default value.
|
|
||||||
|
|
||||||
- For events which are not very high frequency, an interface file
|
|
||||||
"events" should be created which lists event key value pairs.
|
|
||||||
Whenever a notifiable event happens, file modified event should be
|
|
||||||
generated on the file.
|
|
||||||
|
|
||||||
|
|
||||||
5-4. Per-Controller Changes
|
|
||||||
|
|
||||||
5-4-1. io
|
|
||||||
|
|
||||||
- blkio is renamed to io. The interface is overhauled anyway. The
|
|
||||||
new name is more in line with the other two major controllers, cpu
|
|
||||||
and memory, and better suited given that it may be used for cgroup
|
|
||||||
writeback without involving block layer.
|
|
||||||
|
|
||||||
- Everything including stat is always hierarchical making separate
|
|
||||||
recursive stat files pointless and, as no internal node can have
|
|
||||||
tasks, leaf weights are meaningless. The operation model is
|
|
||||||
simplified and the interface is overhauled accordingly.
|
|
||||||
|
|
||||||
io.stat
|
|
||||||
|
|
||||||
The stat file. The reported stats are from the point where
|
|
||||||
bio's are issued to request_queue. The stats are counted
|
|
||||||
independent of which policies are enabled. Each line in the
|
|
||||||
file follows the following format. More fields may later be
|
|
||||||
added at the end.
|
|
||||||
|
|
||||||
$MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS
|
|
||||||
|
|
||||||
io.weight
|
|
||||||
|
|
||||||
The weight setting, currently only available and effective if
|
|
||||||
cfq-iosched is in use for the target device. The weight is
|
|
||||||
between 1 and 10000 and defaults to 100. The first line
|
|
||||||
always contains the default weight in the following format to
|
|
||||||
use when per-device setting is missing.
|
|
||||||
|
|
||||||
default $WEIGHT
|
|
||||||
|
|
||||||
Subsequent lines list per-device weights of the following
|
|
||||||
format.
|
|
||||||
|
|
||||||
$MAJ:$MIN $WEIGHT
|
|
||||||
|
|
||||||
Writing "$WEIGHT" or "default $WEIGHT" changes the default
|
|
||||||
setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
|
|
||||||
while "$MAJ:$MIN default" clears it.
|
|
||||||
|
|
||||||
This file is available only on non-root cgroups.
|
|
||||||
|
|
||||||
io.max
|
|
||||||
|
|
||||||
The maximum bandwidth and/or iops setting, only available if
|
|
||||||
blk-throttle is enabled. The file is of the following format.
|
|
||||||
|
|
||||||
$MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
|
|
||||||
|
|
||||||
${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
|
|
||||||
read/write IOs per second. "max" indicates no limit. Writing
|
|
||||||
to the file follows the same format but the individual
|
|
||||||
settings may be omitted or specified in any order.
|
|
||||||
|
|
||||||
This file is available only on non-root cgroups.
|
|
||||||
|
|
||||||
|
|
||||||
5-4-2. cpuset
|
|
||||||
|
|
||||||
- Tasks are kept in empty cpusets after hotplug and take on the masks
|
|
||||||
of the nearest non-empty ancestor, instead of being moved to it.
|
|
||||||
|
|
||||||
- A task can be moved into an empty cpuset, and again it takes on the
|
|
||||||
masks of the nearest non-empty ancestor.
|
|
||||||
|
|
||||||
|
|
||||||
5-4-3. memory
|
|
||||||
|
|
||||||
- use_hierarchy is on by default and the cgroup file for the flag is
|
|
||||||
not created.
|
|
||||||
|
|
||||||
- The original lower boundary, the soft limit, is defined as a limit
|
|
||||||
that is per default unset. As a result, the set of cgroups that
|
|
||||||
global reclaim prefers is opt-in, rather than opt-out. The costs
|
|
||||||
for optimizing these mostly negative lookups are so high that the
|
|
||||||
implementation, despite its enormous size, does not even provide the
|
|
||||||
basic desirable behavior. First off, the soft limit has no
|
|
||||||
hierarchical meaning. All configured groups are organized in a
|
|
||||||
global rbtree and treated like equal peers, regardless where they
|
|
||||||
are located in the hierarchy. This makes subtree delegation
|
|
||||||
impossible. Second, the soft limit reclaim pass is so aggressive
|
|
||||||
that it not just introduces high allocation latencies into the
|
|
||||||
system, but also impacts system performance due to overreclaim, to
|
|
||||||
the point where the feature becomes self-defeating.
|
|
||||||
|
|
||||||
The memory.low boundary on the other hand is a top-down allocated
|
|
||||||
reserve. A cgroup enjoys reclaim protection when it and all its
|
|
||||||
ancestors are below their low boundaries, which makes delegation of
|
|
||||||
subtrees possible. Secondly, new cgroups have no reserve per
|
|
||||||
default and in the common case most cgroups are eligible for the
|
|
||||||
preferred reclaim pass. This allows the new low boundary to be
|
|
||||||
efficiently implemented with just a minor addition to the generic
|
|
||||||
reclaim code, without the need for out-of-band data structures and
|
|
||||||
reclaim passes. Because the generic reclaim code considers all
|
|
||||||
cgroups except for the ones running low in the preferred first
|
|
||||||
reclaim pass, overreclaim of individual groups is eliminated as
|
|
||||||
well, resulting in much better overall workload performance.
|
|
||||||
|
|
||||||
- The original high boundary, the hard limit, is defined as a strict
|
|
||||||
limit that can not budge, even if the OOM killer has to be called.
|
|
||||||
But this generally goes against the goal of making the most out of
|
|
||||||
the available memory. The memory consumption of workloads varies
|
|
||||||
during runtime, and that requires users to overcommit. But doing
|
|
||||||
that with a strict upper limit requires either a fairly accurate
|
|
||||||
prediction of the working set size or adding slack to the limit.
|
|
||||||
Since working set size estimation is hard and error prone, and
|
|
||||||
getting it wrong results in OOM kills, most users tend to err on the
|
|
||||||
side of a looser limit and end up wasting precious resources.
|
|
||||||
|
|
||||||
The memory.high boundary on the other hand can be set much more
|
|
||||||
conservatively. When hit, it throttles allocations by forcing them
|
|
||||||
into direct reclaim to work off the excess, but it never invokes the
|
|
||||||
OOM killer. As a result, a high boundary that is chosen too
|
|
||||||
aggressively will not terminate the processes, but instead it will
|
|
||||||
lead to gradual performance degradation. The user can monitor this
|
|
||||||
and make corrections until the minimal memory footprint that still
|
|
||||||
gives acceptable performance is found.
|
|
||||||
|
|
||||||
In extreme cases, with many concurrent allocations and a complete
|
|
||||||
breakdown of reclaim progress within the group, the high boundary
|
|
||||||
can be exceeded. But even then it's mostly better to satisfy the
|
|
||||||
allocation from the slack available in other groups or the rest of
|
|
||||||
the system than killing the group. Otherwise, memory.max is there
|
|
||||||
to limit this type of spillover and ultimately contain buggy or even
|
|
||||||
malicious applications.
|
|
||||||
|
|
||||||
- The original control file names are unwieldy and inconsistent in
|
|
||||||
many different ways. For example, the upper boundary hit count is
|
|
||||||
exported in the memory.failcnt file, but an OOM event count has to
|
|
||||||
be manually counted by listening to memory.oom_control events, and
|
|
||||||
lower boundary / soft limit events have to be counted by first
|
|
||||||
setting a threshold for that value and then counting those events.
|
|
||||||
Also, usage and limit files encode their units in the filename.
|
|
||||||
That makes the filenames very long, even though this is not
|
|
||||||
information that a user needs to be reminded of every time they type
|
|
||||||
out those names.
|
|
||||||
|
|
||||||
To address these naming issues, as well as to signal clearly that
|
|
||||||
the new interface carries a new configuration model, the naming
|
|
||||||
conventions in it necessarily differ from the old interface.
|
|
||||||
|
|
||||||
- The original limit files indicate the state of an unset limit with a
|
|
||||||
Very High Number, and a configured limit can be unset by echoing -1
|
|
||||||
into those files. But that very high number is implementation and
|
|
||||||
architecture dependent and not very descriptive. And while -1 can
|
|
||||||
be understood as an underflow into the highest possible value, -2 or
|
|
||||||
-10M etc. do not work, so it's not consistent.
|
|
||||||
|
|
||||||
memory.low, memory.high, and memory.max will use the string "max" to
|
|
||||||
indicate and set the highest possible value.
|
|
||||||
|
|
||||||
6. Planned Changes
|
|
||||||
|
|
||||||
6-1. CAP for resource control
|
|
||||||
|
|
||||||
Unified hierarchy will require one of the capabilities(7), which is
|
|
||||||
yet to be decided, for all resource control related knobs. Process
|
|
||||||
organization operations - creation of sub-cgroups and migration of
|
|
||||||
processes in sub-hierarchies may be delegated by changing the
|
|
||||||
ownership and/or permissions on the cgroup directory and
|
|
||||||
"cgroup.procs" interface file; however, all operations which affect
|
|
||||||
resource control - writes to a "cgroup.subtree_control" file or any
|
|
||||||
controller-specific knobs - will require an explicit CAP privilege.
|
|
||||||
|
|
||||||
This, in part, is to prevent the cgroup interface from being
|
|
||||||
inadvertently promoted to programmable API used by non-privileged
|
|
||||||
binaries. cgroup exposes various aspects of the system in ways which
|
|
||||||
aren't properly abstracted for direct consumption by regular programs.
|
|
||||||
This is an administration interface much closer to sysctl knobs than
|
|
||||||
system calls. Even the basic access model, being filesystem path
|
|
||||||
based, isn't suitable for direct consumption. There's no way to
|
|
||||||
access "my cgroup" in a race-free way or make multiple operations
|
|
||||||
atomic against migration to another cgroup.
|
|
||||||
|
|
||||||
Another aspect is that, for better or for worse, the cgroup interface
|
|
||||||
goes through far less scrutiny than regular interfaces for
|
|
||||||
unprivileged userland. The upside is that cgroup is able to expose
|
|
||||||
useful features which may not be suitable for general consumption in a
|
|
||||||
reasonable time frame. It provides a relatively short path between
|
|
||||||
internal details and userland-visible interface. Of course, this
|
|
||||||
shortcut comes with high risk. We go through what we go through for
|
|
||||||
general kernel APIs for good reasons. It may end up leaking internal
|
|
||||||
details in a way which can exert significant pain by locking the
|
|
||||||
kernel into a contract that can't be maintained in a reasonable
|
|
||||||
manner.
|
|
||||||
|
|
||||||
Also, due to the specific nature, cgroup and its controllers don't
|
|
||||||
tend to attract attention from a wide scope of developers. cgroup's
|
|
||||||
short history is already fraught with severely mis-designed
|
|
||||||
interfaces, unnecessary commitments to and exposing of internal
|
|
||||||
details, broken and dangerous implementations of various features.
|
|
||||||
|
|
||||||
Keeping cgroup as an administration interface is both advantageous for
|
|
||||||
its role and imperative given its nature. Some of the cgroup features
|
|
||||||
may make sense for unprivileged access. If deemed justified, those
|
|
||||||
must be further abstracted and implemented as a different interface,
|
|
||||||
be it a system call or process-private filesystem, and survive through
|
|
||||||
the scrutiny that any interface for general consumption is required to
|
|
||||||
go through.
|
|
||||||
|
|
||||||
Requiring CAP is not a complete solution but should serve as a
|
|
||||||
significant deterrent against spraying cgroup usages in non-privileged
|
|
||||||
programs.
|
|
||||||
@@ -34,17 +34,12 @@ struct seq_file;
|
|||||||
|
|
||||||
/* define the enumeration of all cgroup subsystems */
|
/* define the enumeration of all cgroup subsystems */
|
||||||
#define SUBSYS(_x) _x ## _cgrp_id,
|
#define SUBSYS(_x) _x ## _cgrp_id,
|
||||||
#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
|
|
||||||
__unused_tag_ ## _t = CGROUP_ ## _t - 1,
|
|
||||||
enum cgroup_subsys_id {
|
enum cgroup_subsys_id {
|
||||||
#include <linux/cgroup_subsys.h>
|
#include <linux/cgroup_subsys.h>
|
||||||
CGROUP_SUBSYS_COUNT,
|
CGROUP_SUBSYS_COUNT,
|
||||||
};
|
};
|
||||||
#undef SUBSYS_TAG
|
|
||||||
#undef SUBSYS
|
#undef SUBSYS
|
||||||
|
|
||||||
#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
|
|
||||||
|
|
||||||
/* bits in struct cgroup_subsys_state flags field */
|
/* bits in struct cgroup_subsys_state flags field */
|
||||||
enum {
|
enum {
|
||||||
CSS_NO_REF = (1 << 0), /* no reference counting for this css */
|
CSS_NO_REF = (1 << 0), /* no reference counting for this css */
|
||||||
@@ -66,7 +61,6 @@ enum {
|
|||||||
|
|
||||||
/* cgroup_root->flags */
|
/* cgroup_root->flags */
|
||||||
enum {
|
enum {
|
||||||
CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
|
|
||||||
CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
|
CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
|
||||||
CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
|
CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
|
||||||
};
|
};
|
||||||
@@ -439,9 +433,9 @@ struct cgroup_subsys {
|
|||||||
int (*can_attach)(struct cgroup_taskset *tset);
|
int (*can_attach)(struct cgroup_taskset *tset);
|
||||||
void (*cancel_attach)(struct cgroup_taskset *tset);
|
void (*cancel_attach)(struct cgroup_taskset *tset);
|
||||||
void (*attach)(struct cgroup_taskset *tset);
|
void (*attach)(struct cgroup_taskset *tset);
|
||||||
int (*can_fork)(struct task_struct *task, void **priv_p);
|
int (*can_fork)(struct task_struct *task);
|
||||||
void (*cancel_fork)(struct task_struct *task, void *priv);
|
void (*cancel_fork)(struct task_struct *task);
|
||||||
void (*fork)(struct task_struct *task, void *priv);
|
void (*fork)(struct task_struct *task);
|
||||||
void (*exit)(struct task_struct *task);
|
void (*exit)(struct task_struct *task);
|
||||||
void (*free)(struct task_struct *task);
|
void (*free)(struct task_struct *task);
|
||||||
void (*bind)(struct cgroup_subsys_state *root_css);
|
void (*bind)(struct cgroup_subsys_state *root_css);
|
||||||
@@ -527,7 +521,6 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
|
|||||||
|
|
||||||
#else /* CONFIG_CGROUPS */
|
#else /* CONFIG_CGROUPS */
|
||||||
|
|
||||||
#define CGROUP_CANFORK_COUNT 0
|
|
||||||
#define CGROUP_SUBSYS_COUNT 0
|
#define CGROUP_SUBSYS_COUNT 0
|
||||||
|
|
||||||
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
|
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
|
||||||
|
|||||||
@@ -97,12 +97,9 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
|||||||
struct pid *pid, struct task_struct *tsk);
|
struct pid *pid, struct task_struct *tsk);
|
||||||
|
|
||||||
void cgroup_fork(struct task_struct *p);
|
void cgroup_fork(struct task_struct *p);
|
||||||
extern int cgroup_can_fork(struct task_struct *p,
|
extern int cgroup_can_fork(struct task_struct *p);
|
||||||
void *ss_priv[CGROUP_CANFORK_COUNT]);
|
extern void cgroup_cancel_fork(struct task_struct *p);
|
||||||
extern void cgroup_cancel_fork(struct task_struct *p,
|
extern void cgroup_post_fork(struct task_struct *p);
|
||||||
void *ss_priv[CGROUP_CANFORK_COUNT]);
|
|
||||||
extern void cgroup_post_fork(struct task_struct *p,
|
|
||||||
void *old_ss_priv[CGROUP_CANFORK_COUNT]);
|
|
||||||
void cgroup_exit(struct task_struct *p);
|
void cgroup_exit(struct task_struct *p);
|
||||||
void cgroup_free(struct task_struct *p);
|
void cgroup_free(struct task_struct *p);
|
||||||
|
|
||||||
@@ -562,13 +559,9 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
|
|||||||
struct dentry *dentry) { return -EINVAL; }
|
struct dentry *dentry) { return -EINVAL; }
|
||||||
|
|
||||||
static inline void cgroup_fork(struct task_struct *p) {}
|
static inline void cgroup_fork(struct task_struct *p) {}
|
||||||
static inline int cgroup_can_fork(struct task_struct *p,
|
static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
|
||||||
void *ss_priv[CGROUP_CANFORK_COUNT])
|
static inline void cgroup_cancel_fork(struct task_struct *p) {}
|
||||||
{ return 0; }
|
static inline void cgroup_post_fork(struct task_struct *p) {}
|
||||||
static inline void cgroup_cancel_fork(struct task_struct *p,
|
|
||||||
void *ss_priv[CGROUP_CANFORK_COUNT]) {}
|
|
||||||
static inline void cgroup_post_fork(struct task_struct *p,
|
|
||||||
void *ss_priv[CGROUP_CANFORK_COUNT]) {}
|
|
||||||
static inline void cgroup_exit(struct task_struct *p) {}
|
static inline void cgroup_exit(struct task_struct *p) {}
|
||||||
static inline void cgroup_free(struct task_struct *p) {}
|
static inline void cgroup_free(struct task_struct *p) {}
|
||||||
|
|
||||||
|
|||||||
@@ -6,14 +6,8 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* This file *must* be included with SUBSYS() defined.
|
* This file *must* be included with SUBSYS() defined.
|
||||||
* SUBSYS_TAG() is a noop if undefined.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef SUBSYS_TAG
|
|
||||||
#define __TMP_SUBSYS_TAG
|
|
||||||
#define SUBSYS_TAG(_x)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if IS_ENABLED(CONFIG_CPUSETS)
|
#if IS_ENABLED(CONFIG_CPUSETS)
|
||||||
SUBSYS(cpuset)
|
SUBSYS(cpuset)
|
||||||
#endif
|
#endif
|
||||||
@@ -58,17 +52,10 @@ SUBSYS(net_prio)
|
|||||||
SUBSYS(hugetlb)
|
SUBSYS(hugetlb)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
|
||||||
* Subsystems that implement the can_fork() family of callbacks.
|
|
||||||
*/
|
|
||||||
SUBSYS_TAG(CANFORK_START)
|
|
||||||
|
|
||||||
#if IS_ENABLED(CONFIG_CGROUP_PIDS)
|
#if IS_ENABLED(CONFIG_CGROUP_PIDS)
|
||||||
SUBSYS(pids)
|
SUBSYS(pids)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
SUBSYS_TAG(CANFORK_END)
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The following subsystems are not supported on the default hierarchy.
|
* The following subsystems are not supported on the default hierarchy.
|
||||||
*/
|
*/
|
||||||
@@ -76,11 +63,6 @@ SUBSYS_TAG(CANFORK_END)
|
|||||||
SUBSYS(debug)
|
SUBSYS(debug)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __TMP_SUBSYS_TAG
|
|
||||||
#undef __TMP_SUBSYS_TAG
|
|
||||||
#undef SUBSYS_TAG
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
|
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -54,6 +54,7 @@
|
|||||||
|
|
||||||
#define SMB_SUPER_MAGIC 0x517B
|
#define SMB_SUPER_MAGIC 0x517B
|
||||||
#define CGROUP_SUPER_MAGIC 0x27e0eb
|
#define CGROUP_SUPER_MAGIC 0x27e0eb
|
||||||
|
#define CGROUP2_SUPER_MAGIC 0x63677270
|
||||||
|
|
||||||
|
|
||||||
#define STACK_END_MAGIC 0x57AC6E9D
|
#define STACK_END_MAGIC 0x57AC6E9D
|
||||||
|
|||||||
245
init/Kconfig
245
init/Kconfig
@@ -940,95 +940,24 @@ menuconfig CGROUPS
|
|||||||
|
|
||||||
if CGROUPS
|
if CGROUPS
|
||||||
|
|
||||||
config CGROUP_DEBUG
|
|
||||||
bool "Example debug cgroup subsystem"
|
|
||||||
default n
|
|
||||||
help
|
|
||||||
This option enables a simple cgroup subsystem that
|
|
||||||
exports useful debugging information about the cgroups
|
|
||||||
framework.
|
|
||||||
|
|
||||||
Say N if unsure.
|
|
||||||
|
|
||||||
config CGROUP_FREEZER
|
|
||||||
bool "Freezer cgroup subsystem"
|
|
||||||
help
|
|
||||||
Provides a way to freeze and unfreeze all tasks in a
|
|
||||||
cgroup.
|
|
||||||
|
|
||||||
config CGROUP_PIDS
|
|
||||||
bool "PIDs cgroup subsystem"
|
|
||||||
help
|
|
||||||
Provides enforcement of process number limits in the scope of a
|
|
||||||
cgroup. Any attempt to fork more processes than is allowed in the
|
|
||||||
cgroup will fail. PIDs are fundamentally a global resource because it
|
|
||||||
is fairly trivial to reach PID exhaustion before you reach even a
|
|
||||||
conservative kmemcg limit. As a result, it is possible to grind a
|
|
||||||
system to halt without being limited by other cgroup policies. The
|
|
||||||
PIDs cgroup subsystem is designed to stop this from happening.
|
|
||||||
|
|
||||||
It should be noted that organisational operations (such as attaching
|
|
||||||
to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
|
|
||||||
since the PIDs limit only affects a process's ability to fork, not to
|
|
||||||
attach to a cgroup.
|
|
||||||
|
|
||||||
config CGROUP_DEVICE
|
|
||||||
bool "Device controller for cgroups"
|
|
||||||
help
|
|
||||||
Provides a cgroup implementing whitelists for devices which
|
|
||||||
a process in the cgroup can mknod or open.
|
|
||||||
|
|
||||||
config CPUSETS
|
|
||||||
bool "Cpuset support"
|
|
||||||
help
|
|
||||||
This option will let you create and manage CPUSETs which
|
|
||||||
allow dynamically partitioning a system into sets of CPUs and
|
|
||||||
Memory Nodes and assigning tasks to run only within those sets.
|
|
||||||
This is primarily useful on large SMP or NUMA systems.
|
|
||||||
|
|
||||||
Say N if unsure.
|
|
||||||
|
|
||||||
config PROC_PID_CPUSET
|
|
||||||
bool "Include legacy /proc/<pid>/cpuset file"
|
|
||||||
depends on CPUSETS
|
|
||||||
default y
|
|
||||||
|
|
||||||
config CGROUP_CPUACCT
|
|
||||||
bool "Simple CPU accounting cgroup subsystem"
|
|
||||||
help
|
|
||||||
Provides a simple Resource Controller for monitoring the
|
|
||||||
total CPU consumed by the tasks in a cgroup.
|
|
||||||
|
|
||||||
config PAGE_COUNTER
|
config PAGE_COUNTER
|
||||||
bool
|
bool
|
||||||
|
|
||||||
config MEMCG
|
config MEMCG
|
||||||
bool "Memory Resource Controller for Control Groups"
|
bool "Memory controller"
|
||||||
select PAGE_COUNTER
|
select PAGE_COUNTER
|
||||||
select EVENTFD
|
select EVENTFD
|
||||||
help
|
help
|
||||||
Provides a memory resource controller that manages both anonymous
|
Provides control over the memory footprint of tasks in a cgroup.
|
||||||
memory and page cache. (See Documentation/cgroups/memory.txt)
|
|
||||||
|
|
||||||
config MEMCG_SWAP
|
config MEMCG_SWAP
|
||||||
bool "Memory Resource Controller Swap Extension"
|
bool "Swap controller"
|
||||||
depends on MEMCG && SWAP
|
depends on MEMCG && SWAP
|
||||||
help
|
help
|
||||||
Add swap management feature to memory resource controller. When you
|
Provides control over the swap space consumed by tasks in a cgroup.
|
||||||
enable this, you can limit mem+swap usage per cgroup. In other words,
|
|
||||||
when you disable this, memory resource controller has no cares to
|
|
||||||
usage of swap...a process can exhaust all of the swap. This extension
|
|
||||||
is useful when you want to avoid exhaustion swap but this itself
|
|
||||||
adds more overheads and consumes memory for remembering information.
|
|
||||||
Especially if you use 32bit system or small memory system, please
|
|
||||||
be careful about enabling this. When memory resource controller
|
|
||||||
is disabled by boot option, this will be automatically disabled and
|
|
||||||
there will be no overhead from this. Even when you set this config=y,
|
|
||||||
if boot option "swapaccount=0" is set, swap will not be accounted.
|
|
||||||
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
|
|
||||||
size is 4096bytes, 512k per 1Gbytes of swap.
|
|
||||||
config MEMCG_SWAP_ENABLED
|
config MEMCG_SWAP_ENABLED
|
||||||
bool "Memory Resource Controller Swap Extension enabled by default"
|
bool "Swap controller enabled by default"
|
||||||
depends on MEMCG_SWAP
|
depends on MEMCG_SWAP
|
||||||
default y
|
default y
|
||||||
help
|
help
|
||||||
@@ -1052,34 +981,43 @@ config MEMCG_KMEM
|
|||||||
the kmem extension can use it to guarantee that no group of processes
|
the kmem extension can use it to guarantee that no group of processes
|
||||||
will ever exhaust kernel resources alone.
|
will ever exhaust kernel resources alone.
|
||||||
|
|
||||||
config CGROUP_HUGETLB
|
config BLK_CGROUP
|
||||||
bool "HugeTLB Resource Controller for Control Groups"
|
bool "IO controller"
|
||||||
depends on HUGETLB_PAGE
|
depends on BLOCK
|
||||||
select PAGE_COUNTER
|
|
||||||
default n
|
default n
|
||||||
help
|
---help---
|
||||||
Provides a cgroup Resource Controller for HugeTLB pages.
|
Generic block IO controller cgroup interface. This is the common
|
||||||
When you enable this, you can put a per cgroup limit on HugeTLB usage.
|
cgroup interface which should be used by various IO controlling
|
||||||
The limit is enforced during page fault. Since HugeTLB doesn't
|
policies.
|
||||||
support page reclaim, enforcing the limit at page fault time implies
|
|
||||||
that, the application will get SIGBUS signal if it tries to access
|
|
||||||
HugeTLB pages beyond its limit. This requires the application to know
|
|
||||||
beforehand how much HugeTLB pages it would require for its use. The
|
|
||||||
control group is tracked in the third page lru pointer. This means
|
|
||||||
that we cannot use the controller with huge page less than 3 pages.
|
|
||||||
|
|
||||||
config CGROUP_PERF
|
Currently, CFQ IO scheduler uses it to recognize task groups and
|
||||||
bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
|
control disk bandwidth allocation (proportional time slice allocation)
|
||||||
depends on PERF_EVENTS && CGROUPS
|
to such task groups. It is also used by bio throttling logic in
|
||||||
help
|
block layer to implement upper limit in IO rates on a device.
|
||||||
This option extends the per-cpu mode to restrict monitoring to
|
|
||||||
threads which belong to the cgroup specified and run on the
|
|
||||||
designated cpu.
|
|
||||||
|
|
||||||
Say N if unsure.
|
This option only enables generic Block IO controller infrastructure.
|
||||||
|
One needs to also enable actual IO controlling logic/policy. For
|
||||||
|
enabling proportional weight division of disk bandwidth in CFQ, set
|
||||||
|
CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
|
||||||
|
CONFIG_BLK_DEV_THROTTLING=y.
|
||||||
|
|
||||||
|
See Documentation/cgroups/blkio-controller.txt for more information.
|
||||||
|
|
||||||
|
config DEBUG_BLK_CGROUP
|
||||||
|
bool "IO controller debugging"
|
||||||
|
depends on BLK_CGROUP
|
||||||
|
default n
|
||||||
|
---help---
|
||||||
|
Enable some debugging help. Currently it exports additional stat
|
||||||
|
files in a cgroup which can be useful for debugging.
|
||||||
|
|
||||||
|
config CGROUP_WRITEBACK
|
||||||
|
bool
|
||||||
|
depends on MEMCG && BLK_CGROUP
|
||||||
|
default y
|
||||||
|
|
||||||
menuconfig CGROUP_SCHED
|
menuconfig CGROUP_SCHED
|
||||||
bool "Group CPU scheduler"
|
bool "CPU controller"
|
||||||
default n
|
default n
|
||||||
help
|
help
|
||||||
This feature lets CPU scheduler recognize task groups and control CPU
|
This feature lets CPU scheduler recognize task groups and control CPU
|
||||||
@@ -1116,41 +1054,90 @@ config RT_GROUP_SCHED
|
|||||||
|
|
||||||
endif #CGROUP_SCHED
|
endif #CGROUP_SCHED
|
||||||
|
|
||||||
config BLK_CGROUP
|
config CGROUP_PIDS
|
||||||
bool "Block IO controller"
|
bool "PIDs controller"
|
||||||
depends on BLOCK
|
help
|
||||||
|
Provides enforcement of process number limits in the scope of a
|
||||||
|
cgroup. Any attempt to fork more processes than is allowed in the
|
||||||
|
cgroup will fail. PIDs are fundamentally a global resource because it
|
||||||
|
is fairly trivial to reach PID exhaustion before you reach even a
|
||||||
|
conservative kmemcg limit. As a result, it is possible to grind a
|
||||||
|
system to halt without being limited by other cgroup policies. The
|
||||||
|
PIDs cgroup subsystem is designed to stop this from happening.
|
||||||
|
|
||||||
|
It should be noted that organisational operations (such as attaching
|
||||||
|
to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
|
||||||
|
since the PIDs limit only affects a process's ability to fork, not to
|
||||||
|
attach to a cgroup.
|
||||||
|
|
||||||
|
config CGROUP_FREEZER
|
||||||
|
bool "Freezer controller"
|
||||||
|
help
|
||||||
|
Provides a way to freeze and unfreeze all tasks in a
|
||||||
|
cgroup.
|
||||||
|
|
||||||
|
config CGROUP_HUGETLB
|
||||||
|
bool "HugeTLB controller"
|
||||||
|
depends on HUGETLB_PAGE
|
||||||
|
select PAGE_COUNTER
|
||||||
default n
|
default n
|
||||||
---help---
|
help
|
||||||
Generic block IO controller cgroup interface. This is the common
|
Provides a cgroup controller for HugeTLB pages.
|
||||||
cgroup interface which should be used by various IO controlling
|
When you enable this, you can put a per cgroup limit on HugeTLB usage.
|
||||||
policies.
|
The limit is enforced during page fault. Since HugeTLB doesn't
|
||||||
|
support page reclaim, enforcing the limit at page fault time implies
|
||||||
|
that, the application will get SIGBUS signal if it tries to access
|
||||||
|
HugeTLB pages beyond its limit. This requires the application to know
|
||||||
|
beforehand how much HugeTLB pages it would require for its use. The
|
||||||
|
control group is tracked in the third page lru pointer. This means
|
||||||
|
that we cannot use the controller with huge page less than 3 pages.
|
||||||
|
|
||||||
Currently, CFQ IO scheduler uses it to recognize task groups and
|
config CPUSETS
|
||||||
control disk bandwidth allocation (proportional time slice allocation)
|
bool "Cpuset controller"
|
||||||
to such task groups. It is also used by bio throttling logic in
|
help
|
||||||
block layer to implement upper limit in IO rates on a device.
|
This option will let you create and manage CPUSETs which
|
||||||
|
allow dynamically partitioning a system into sets of CPUs and
|
||||||
|
Memory Nodes and assigning tasks to run only within those sets.
|
||||||
|
This is primarily useful on large SMP or NUMA systems.
|
||||||
|
|
||||||
This option only enables generic Block IO controller infrastructure.
|
Say N if unsure.
|
||||||
One needs to also enable actual IO controlling logic/policy. For
|
|
||||||
enabling proportional weight division of disk bandwidth in CFQ, set
|
|
||||||
CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
|
|
||||||
CONFIG_BLK_DEV_THROTTLING=y.
|
|
||||||
|
|
||||||
See Documentation/cgroups/blkio-controller.txt for more information.
|
config PROC_PID_CPUSET
|
||||||
|
bool "Include legacy /proc/<pid>/cpuset file"
|
||||||
config DEBUG_BLK_CGROUP
|
depends on CPUSETS
|
||||||
bool "Enable Block IO controller debugging"
|
|
||||||
depends on BLK_CGROUP
|
|
||||||
default n
|
|
||||||
---help---
|
|
||||||
Enable some debugging help. Currently it exports additional stat
|
|
||||||
files in a cgroup which can be useful for debugging.
|
|
||||||
|
|
||||||
config CGROUP_WRITEBACK
|
|
||||||
bool
|
|
||||||
depends on MEMCG && BLK_CGROUP
|
|
||||||
default y
|
default y
|
||||||
|
|
||||||
|
config CGROUP_DEVICE
|
||||||
|
bool "Device controller"
|
||||||
|
help
|
||||||
|
Provides a cgroup controller implementing whitelists for
|
||||||
|
devices which a process in the cgroup can mknod or open.
|
||||||
|
|
||||||
|
config CGROUP_CPUACCT
|
||||||
|
bool "Simple CPU accounting controller"
|
||||||
|
help
|
||||||
|
Provides a simple controller for monitoring the
|
||||||
|
total CPU consumed by the tasks in a cgroup.
|
||||||
|
|
||||||
|
config CGROUP_PERF
|
||||||
|
bool "Perf controller"
|
||||||
|
depends on PERF_EVENTS
|
||||||
|
help
|
||||||
|
This option extends the perf per-cpu mode to restrict monitoring
|
||||||
|
to threads which belong to the cgroup specified and run on the
|
||||||
|
designated cpu.
|
||||||
|
|
||||||
|
Say N if unsure.
|
||||||
|
|
||||||
|
config CGROUP_DEBUG
|
||||||
|
bool "Example controller"
|
||||||
|
default n
|
||||||
|
help
|
||||||
|
This option enables a simple controller that exports
|
||||||
|
debugging information about the cgroups framework.
|
||||||
|
|
||||||
|
Say N.
|
||||||
|
|
||||||
endif # CGROUPS
|
endif # CGROUPS
|
||||||
|
|
||||||
config CHECKPOINT_RESTORE
|
config CHECKPOINT_RESTORE
|
||||||
|
|||||||
@@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;
|
|||||||
/* Ditto for the can_fork callback. */
|
/* Ditto for the can_fork callback. */
|
||||||
static unsigned long have_canfork_callback __read_mostly;
|
static unsigned long have_canfork_callback __read_mostly;
|
||||||
|
|
||||||
|
static struct file_system_type cgroup2_fs_type;
|
||||||
static struct cftype cgroup_dfl_base_files[];
|
static struct cftype cgroup_dfl_base_files[];
|
||||||
static struct cftype cgroup_legacy_base_files[];
|
static struct cftype cgroup_legacy_base_files[];
|
||||||
|
|
||||||
@@ -1623,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
|
|||||||
all_ss = true;
|
all_ss = true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!strcmp(token, "__DEVEL__sane_behavior")) {
|
|
||||||
opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!strcmp(token, "noprefix")) {
|
if (!strcmp(token, "noprefix")) {
|
||||||
opts->flags |= CGRP_ROOT_NOPREFIX;
|
opts->flags |= CGRP_ROOT_NOPREFIX;
|
||||||
continue;
|
continue;
|
||||||
@@ -1693,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
|
|||||||
return -ENOENT;
|
return -ENOENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
|
|
||||||
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
|
|
||||||
if (nr_opts != 1) {
|
|
||||||
pr_err("sane_behavior: no other mount options allowed\n");
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the 'all' option was specified select all the subsystems,
|
* If the 'all' option was specified select all the subsystems,
|
||||||
* otherwise if 'none', 'name=' and a subsystem name options were
|
* otherwise if 'none', 'name=' and a subsystem name options were
|
||||||
@@ -1981,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|||||||
int flags, const char *unused_dev_name,
|
int flags, const char *unused_dev_name,
|
||||||
void *data)
|
void *data)
|
||||||
{
|
{
|
||||||
|
bool is_v2 = fs_type == &cgroup2_fs_type;
|
||||||
struct super_block *pinned_sb = NULL;
|
struct super_block *pinned_sb = NULL;
|
||||||
struct cgroup_subsys *ss;
|
struct cgroup_subsys *ss;
|
||||||
struct cgroup_root *root;
|
struct cgroup_root *root;
|
||||||
@@ -1997,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|||||||
if (!use_task_css_set_links)
|
if (!use_task_css_set_links)
|
||||||
cgroup_enable_task_cg_lists();
|
cgroup_enable_task_cg_lists();
|
||||||
|
|
||||||
|
if (is_v2) {
|
||||||
|
if (data) {
|
||||||
|
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
|
||||||
|
return ERR_PTR(-EINVAL);
|
||||||
|
}
|
||||||
|
cgrp_dfl_root_visible = true;
|
||||||
|
root = &cgrp_dfl_root;
|
||||||
|
cgroup_get(&root->cgrp);
|
||||||
|
goto out_mount;
|
||||||
|
}
|
||||||
|
|
||||||
mutex_lock(&cgroup_mutex);
|
mutex_lock(&cgroup_mutex);
|
||||||
|
|
||||||
/* First find the desired set of subsystems */
|
/* First find the desired set of subsystems */
|
||||||
@@ -2004,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
|
|
||||||
/* look for a matching existing root */
|
|
||||||
if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
|
|
||||||
cgrp_dfl_root_visible = true;
|
|
||||||
root = &cgrp_dfl_root;
|
|
||||||
cgroup_get(&root->cgrp);
|
|
||||||
ret = 0;
|
|
||||||
goto out_unlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Destruction of cgroup root is asynchronous, so subsystems may
|
* Destruction of cgroup root is asynchronous, so subsystems may
|
||||||
* still be dying after the previous unmount. Let's drain the
|
* still be dying after the previous unmount. Let's drain the
|
||||||
@@ -2123,9 +2114,10 @@ out_free:
|
|||||||
|
|
||||||
if (ret)
|
if (ret)
|
||||||
return ERR_PTR(ret);
|
return ERR_PTR(ret);
|
||||||
|
out_mount:
|
||||||
dentry = kernfs_mount(fs_type, flags, root->kf_root,
|
dentry = kernfs_mount(fs_type, flags, root->kf_root,
|
||||||
CGROUP_SUPER_MAGIC, &new_sb);
|
is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
|
||||||
|
&new_sb);
|
||||||
if (IS_ERR(dentry) || !new_sb)
|
if (IS_ERR(dentry) || !new_sb)
|
||||||
cgroup_put(&root->cgrp);
|
cgroup_put(&root->cgrp);
|
||||||
|
|
||||||
@@ -2168,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = {
|
|||||||
.kill_sb = cgroup_kill_sb,
|
.kill_sb = cgroup_kill_sb,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static struct file_system_type cgroup2_fs_type = {
|
||||||
|
.name = "cgroup2",
|
||||||
|
.mount = cgroup_mount,
|
||||||
|
.kill_sb = cgroup_kill_sb,
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
|
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
|
||||||
* @task: target task
|
* @task: target task
|
||||||
@@ -4039,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
|
|||||||
goto out_err;
|
goto out_err;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Migrate tasks one-by-one until @form is empty. This fails iff
|
* Migrate tasks one-by-one until @from is empty. This fails iff
|
||||||
* ->can_attach() fails.
|
* ->can_attach() fails.
|
||||||
*/
|
*/
|
||||||
do {
|
do {
|
||||||
@@ -5171,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
|
|||||||
{
|
{
|
||||||
struct cgroup_subsys_state *css;
|
struct cgroup_subsys_state *css;
|
||||||
|
|
||||||
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
|
pr_debug("Initializing cgroup subsys %s\n", ss->name);
|
||||||
|
|
||||||
mutex_lock(&cgroup_mutex);
|
mutex_lock(&cgroup_mutex);
|
||||||
|
|
||||||
@@ -5329,6 +5327,7 @@ int __init cgroup_init(void)
|
|||||||
|
|
||||||
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
|
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
|
||||||
WARN_ON(register_filesystem(&cgroup_fs_type));
|
WARN_ON(register_filesystem(&cgroup_fs_type));
|
||||||
|
WARN_ON(register_filesystem(&cgroup2_fs_type));
|
||||||
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
|
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
@@ -5472,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = {
|
|||||||
.release = single_release,
|
.release = single_release,
|
||||||
};
|
};
|
||||||
|
|
||||||
static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
|
|
||||||
{
|
|
||||||
if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
|
|
||||||
return &ss_priv[i - CGROUP_CANFORK_START];
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
|
|
||||||
{
|
|
||||||
void **private = subsys_canfork_priv_p(ss_priv, i);
|
|
||||||
return private ? *private : NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* cgroup_fork - initialize cgroup related fields during copy_process()
|
* cgroup_fork - initialize cgroup related fields during copy_process()
|
||||||
* @child: pointer to task_struct of forking parent process.
|
* @child: pointer to task_struct of forking parent process.
|
||||||
@@ -5507,14 +5493,13 @@ void cgroup_fork(struct task_struct *child)
|
|||||||
* returns an error, the fork aborts with that error code. This allows for
|
* returns an error, the fork aborts with that error code. This allows for
|
||||||
* a cgroup subsystem to conditionally allow or deny new forks.
|
* a cgroup subsystem to conditionally allow or deny new forks.
|
||||||
*/
|
*/
|
||||||
int cgroup_can_fork(struct task_struct *child,
|
int cgroup_can_fork(struct task_struct *child)
|
||||||
void *ss_priv[CGROUP_CANFORK_COUNT])
|
|
||||||
{
|
{
|
||||||
struct cgroup_subsys *ss;
|
struct cgroup_subsys *ss;
|
||||||
int i, j, ret;
|
int i, j, ret;
|
||||||
|
|
||||||
for_each_subsys_which(ss, i, &have_canfork_callback) {
|
for_each_subsys_which(ss, i, &have_canfork_callback) {
|
||||||
ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
|
ret = ss->can_fork(child);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out_revert;
|
goto out_revert;
|
||||||
}
|
}
|
||||||
@@ -5526,7 +5511,7 @@ out_revert:
|
|||||||
if (j >= i)
|
if (j >= i)
|
||||||
break;
|
break;
|
||||||
if (ss->cancel_fork)
|
if (ss->cancel_fork)
|
||||||
ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
|
ss->cancel_fork(child);
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
@@ -5539,15 +5524,14 @@ out_revert:
|
|||||||
* This calls the cancel_fork() callbacks if a fork failed *after*
|
* This calls the cancel_fork() callbacks if a fork failed *after*
|
||||||
* cgroup_can_fork() succeded.
|
* cgroup_can_fork() succeded.
|
||||||
*/
|
*/
|
||||||
void cgroup_cancel_fork(struct task_struct *child,
|
void cgroup_cancel_fork(struct task_struct *child)
|
||||||
void *ss_priv[CGROUP_CANFORK_COUNT])
|
|
||||||
{
|
{
|
||||||
struct cgroup_subsys *ss;
|
struct cgroup_subsys *ss;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for_each_subsys(ss, i)
|
for_each_subsys(ss, i)
|
||||||
if (ss->cancel_fork)
|
if (ss->cancel_fork)
|
||||||
ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
|
ss->cancel_fork(child);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -5560,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child,
|
|||||||
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
|
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
|
||||||
* list.
|
* list.
|
||||||
*/
|
*/
|
||||||
void cgroup_post_fork(struct task_struct *child,
|
void cgroup_post_fork(struct task_struct *child)
|
||||||
void *old_ss_priv[CGROUP_CANFORK_COUNT])
|
|
||||||
{
|
{
|
||||||
struct cgroup_subsys *ss;
|
struct cgroup_subsys *ss;
|
||||||
int i;
|
int i;
|
||||||
@@ -5605,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child,
|
|||||||
* and addition to css_set.
|
* and addition to css_set.
|
||||||
*/
|
*/
|
||||||
for_each_subsys_which(ss, i, &have_fork_callback)
|
for_each_subsys_which(ss, i, &have_fork_callback)
|
||||||
ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
|
ss->fork(child);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
|
|||||||
* to do anything as freezer_attach() will put @task into the appropriate
|
* to do anything as freezer_attach() will put @task into the appropriate
|
||||||
* state.
|
* state.
|
||||||
*/
|
*/
|
||||||
static void freezer_fork(struct task_struct *task, void *private)
|
static void freezer_fork(struct task_struct *task)
|
||||||
{
|
{
|
||||||
struct freezer *freezer;
|
struct freezer *freezer;
|
||||||
|
|
||||||
|
|||||||
@@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
|
|||||||
*
|
*
|
||||||
* This function follows the set limit. It will fail if the charge would cause
|
* This function follows the set limit. It will fail if the charge would cause
|
||||||
* the new value to exceed the hierarchical limit. Returns 0 if the charge
|
* the new value to exceed the hierarchical limit. Returns 0 if the charge
|
||||||
* succeded, otherwise -EAGAIN.
|
* succeeded, otherwise -EAGAIN.
|
||||||
*/
|
*/
|
||||||
static int pids_try_charge(struct pids_cgroup *pids, int num)
|
static int pids_try_charge(struct pids_cgroup *pids, int num)
|
||||||
{
|
{
|
||||||
@@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
|
|||||||
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
|
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
|
||||||
* on threadgroup_change_begin() held by the copy_process().
|
* on threadgroup_change_begin() held by the copy_process().
|
||||||
*/
|
*/
|
||||||
static int pids_can_fork(struct task_struct *task, void **priv_p)
|
static int pids_can_fork(struct task_struct *task)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys_state *css;
|
struct cgroup_subsys_state *css;
|
||||||
struct pids_cgroup *pids;
|
struct pids_cgroup *pids;
|
||||||
@@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
|
|||||||
return pids_try_charge(pids, 1);
|
return pids_try_charge(pids, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void pids_cancel_fork(struct task_struct *task, void *priv)
|
static void pids_cancel_fork(struct task_struct *task)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys_state *css;
|
struct cgroup_subsys_state *css;
|
||||||
struct pids_cgroup *pids;
|
struct pids_cgroup *pids;
|
||||||
|
|||||||
@@ -51,6 +51,7 @@
|
|||||||
#include <linux/stat.h>
|
#include <linux/stat.h>
|
||||||
#include <linux/string.h>
|
#include <linux/string.h>
|
||||||
#include <linux/time.h>
|
#include <linux/time.h>
|
||||||
|
#include <linux/time64.h>
|
||||||
#include <linux/backing-dev.h>
|
#include <linux/backing-dev.h>
|
||||||
#include <linux/sort.h>
|
#include <linux/sort.h>
|
||||||
|
|
||||||
@@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
|
|||||||
struct fmeter {
|
struct fmeter {
|
||||||
int cnt; /* unprocessed events count */
|
int cnt; /* unprocessed events count */
|
||||||
int val; /* most recent output value */
|
int val; /* most recent output value */
|
||||||
time_t time; /* clock (secs) when val computed */
|
time64_t time; /* clock (secs) when val computed */
|
||||||
spinlock_t lock; /* guards read or write of above */
|
spinlock_t lock; /* guards read or write of above */
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1374,7 +1375,7 @@ out:
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
|
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
|
||||||
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
|
#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
|
||||||
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
|
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
|
||||||
#define FM_SCALE 1000 /* faux fixed point scale */
|
#define FM_SCALE 1000 /* faux fixed point scale */
|
||||||
|
|
||||||
@@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp)
|
|||||||
/* Internal meter update - process cnt events and update value */
|
/* Internal meter update - process cnt events and update value */
|
||||||
static void fmeter_update(struct fmeter *fmp)
|
static void fmeter_update(struct fmeter *fmp)
|
||||||
{
|
{
|
||||||
time_t now = get_seconds();
|
time64_t now;
|
||||||
time_t ticks = now - fmp->time;
|
u32 ticks;
|
||||||
|
|
||||||
|
now = ktime_get_seconds();
|
||||||
|
ticks = now - fmp->time;
|
||||||
|
|
||||||
if (ticks == 0)
|
if (ticks == 0)
|
||||||
return;
|
return;
|
||||||
|
|||||||
@@ -1250,7 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||||||
{
|
{
|
||||||
int retval;
|
int retval;
|
||||||
struct task_struct *p;
|
struct task_struct *p;
|
||||||
void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
|
|
||||||
|
|
||||||
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
|
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
|
||||||
return ERR_PTR(-EINVAL);
|
return ERR_PTR(-EINVAL);
|
||||||
@@ -1527,7 +1526,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||||||
* between here and cgroup_post_fork() if an organisation operation is in
|
* between here and cgroup_post_fork() if an organisation operation is in
|
||||||
* progress.
|
* progress.
|
||||||
*/
|
*/
|
||||||
retval = cgroup_can_fork(p, cgrp_ss_priv);
|
retval = cgroup_can_fork(p);
|
||||||
if (retval)
|
if (retval)
|
||||||
goto bad_fork_free_pid;
|
goto bad_fork_free_pid;
|
||||||
|
|
||||||
@@ -1609,7 +1608,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||||||
write_unlock_irq(&tasklist_lock);
|
write_unlock_irq(&tasklist_lock);
|
||||||
|
|
||||||
proc_fork_connector(p);
|
proc_fork_connector(p);
|
||||||
cgroup_post_fork(p, cgrp_ss_priv);
|
cgroup_post_fork(p);
|
||||||
threadgroup_change_end(current);
|
threadgroup_change_end(current);
|
||||||
perf_event_fork(p);
|
perf_event_fork(p);
|
||||||
|
|
||||||
@@ -1619,7 +1618,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||||||
return p;
|
return p;
|
||||||
|
|
||||||
bad_fork_cancel_cgroup:
|
bad_fork_cancel_cgroup:
|
||||||
cgroup_cancel_fork(p, cgrp_ss_priv);
|
cgroup_cancel_fork(p);
|
||||||
bad_fork_free_pid:
|
bad_fork_free_pid:
|
||||||
if (pid != &init_struct_pid)
|
if (pid != &init_struct_pid)
|
||||||
free_pid(pid);
|
free_pid(pid);
|
||||||
|
|||||||
@@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
|
|||||||
sched_offline_group(tg);
|
sched_offline_group(tg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cpu_cgroup_fork(struct task_struct *task, void *private)
|
static void cpu_cgroup_fork(struct task_struct *task)
|
||||||
{
|
{
|
||||||
sched_move_task(task);
|
sched_move_task(task);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4813,7 +4813,7 @@ static void mem_cgroup_clear_mc(void)
|
|||||||
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
|
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
|
||||||
{
|
{
|
||||||
struct cgroup_subsys_state *css;
|
struct cgroup_subsys_state *css;
|
||||||
struct mem_cgroup *memcg;
|
struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
|
||||||
struct mem_cgroup *from;
|
struct mem_cgroup *from;
|
||||||
struct task_struct *leader, *p;
|
struct task_struct *leader, *p;
|
||||||
struct mm_struct *mm;
|
struct mm_struct *mm;
|
||||||
|
|||||||
Reference in New Issue
Block a user