Merge branch 'rcu/doc' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/urgent
Pull RCU documentation update for reducing OS jitter due to per-CPU kthreads, from Paul McKenney. Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit fd29f424d4
@@ -217,9 +217,14 @@ over a rather long period of time, but improvements are always welcome!
 whether the increased speed is worth it.
 
 8. Although synchronize_rcu() is slower than is call_rcu(), it
-usually results in simpler code. So, unless update performance
-is critically important or the updaters cannot block,
-synchronize_rcu() should be used in preference to call_rcu().
+usually results in simpler code. So, unless update performance is
+critically important, the updaters cannot block, or the latency of
+synchronize_rcu() is visible from userspace, synchronize_rcu()
+should be used in preference to call_rcu(). Furthermore,
+kfree_rcu() usually results in even simpler code than does
+synchronize_rcu() without synchronize_rcu()'s multi-millisecond
+latency. So please take advantage of kfree_rcu()'s "fire and
+forget" memory-freeing capabilities where it applies.
 
 An especially important property of the synchronize_rcu()
 primitive is that it automatically self-limits: if grace periods
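(Illustrative aside, not part of the patch: a minimal sketch of the "fire and forget" pattern the new text describes, using a hypothetical struct foo on an RCU-protected list.)

	struct foo {
		struct list_head list;
		int data;
		struct rcu_head rcu;		/* storage used by kfree_rcu() */
	};

	/* Updater: unlink the element, then let RCU free it after a grace period. */
	static void remove_foo(struct foo *p)
	{
		list_del_rcu(&p->list);		/* readers may still hold references */
		kfree_rcu(p, rcu);		/* no blocking, no callback function to write */
	}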
@@ -268,7 +273,8 @@ over a rather long period of time, but improvements are always welcome!
 e. Periodically invoke synchronize_rcu(), permitting a limited
 number of updates per grace period.
 
-The same cautions apply to call_rcu_bh() and call_rcu_sched().
+The same cautions apply to call_rcu_bh(), call_rcu_sched(),
+call_srcu(), and kfree_rcu().
 
 9. All RCU list-traversal primitives, which include
 rcu_dereference(), list_for_each_entry_rcu(), and
@@ -296,9 +302,9 @@ over a rather long period of time, but improvements are always welcome!
 all currently executing rcu_read_lock()-protected RCU read-side
 critical sections complete. It does -not- necessarily guarantee
 that all currently running interrupts, NMIs, preempt_disable()
-code, or idle loops will complete. Therefore, if you do not have
-rcu_read_lock()-protected read-side critical sections, do -not-
-use synchronize_rcu().
+code, or idle loops will complete. Therefore, if your
+read-side critical sections are protected by something other
+than rcu_read_lock(), do -not- use synchronize_rcu().
 
 Similarly, disabling preemption is not an acceptable substitute
 for rcu_read_lock(). Code that attempts to use preemption
@@ -401,9 +407,9 @@ over a rather long period of time, but improvements are always welcome!
 read-side critical sections. It is the responsibility of the
 RCU update-side primitives to deal with this.
 
-17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
-the __rcu sparse checks to validate your RCU code. These
-can help find problems as follows:
+17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
+__rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to
+validate your RCU code. These can help find problems as follows:
 
 CONFIG_PROVE_RCU: check that accesses to RCU-protected data
 structures are carried out under the proper RCU
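(Illustrative aside, not from the patch: the kind of annotation these debugging aids check, for a hypothetical __rcu pointer that is updated under mylock.)

	static struct foo __rcu *global_foo;	/* __rcu annotation is checked by sparse */
	static DEFINE_SPINLOCK(mylock);

	static struct foo *get_foo(void)
	{
		/* Legal under rcu_read_lock() or with mylock held;
		 * anything else makes CONFIG_PROVE_RCU complain. */
		return rcu_dereference_check(global_foo, lockdep_is_held(&mylock));
	}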
@@ -64,6 +64,11 @@ checking of rcu_dereference() primitives:
 but retain the compiler constraints that prevent duplicating
 or coalescsing. This is useful when when testing the
 value of the pointer itself, for example, against NULL.
+rcu_access_index(idx):
+Return the value of the index and omit all barriers, but
+retain the compiler constraints that prevent duplicating
+or coalescsing. This is useful when when testing the
+value of the index itself, for example, against -1.
 
 The rcu_dereference_check() check expression can be any boolean
 expression, but would normally include a lockdep expression. However,
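(Illustrative aside, not from the patch: rcu_access_pointer() in the pointer-against-NULL case described above; the newly documented rcu_access_index() plays the same role for an array index tested against -1.)

	/* Only the pointer's value is tested against NULL, so neither
	 * rcu_read_lock() nor rcu_dereference()'s ordering is needed here. */
	if (!rcu_access_pointer(global_foo))
		return;		/* nothing published yet */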
@@ -79,7 +79,20 @@ complete. Pseudo-code using rcu_barrier() is as follows:
 2. Execute rcu_barrier().
 3. Allow the module to be unloaded.
 
-The rcutorture module makes use of rcu_barrier in its exit function
+There are also rcu_barrier_bh(), rcu_barrier_sched(), and srcu_barrier()
+functions for the other flavors of RCU, and you of course must match
+the flavor of rcu_barrier() with that of call_rcu(). If your module
+uses multiple flavors of call_rcu(), then it must also use multiple
+flavors of rcu_barrier() when unloading that module. For example, if
+it uses call_rcu_bh(), call_srcu() on srcu_struct_1, and call_srcu() on
+srcu_struct_2(), then the following three lines of code will be required
+when unloading:
+
+ 1 rcu_barrier_bh();
+ 2 srcu_barrier(&srcu_struct_1);
+ 3 srcu_barrier(&srcu_struct_2);
+
+The rcutorture module makes use of rcu_barrier() in its exit function
 as follows:
 
 1 static void
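(Illustrative aside, not from the patch: a hypothetical module-exit function combining steps 1-3 above with the three barrier calls; stop_queueing_callbacks() stands in for whatever step 1 looks like in a real module.)

	static void __exit my_module_exit(void)
	{
		stop_queueing_callbacks();	/* 1. no new call_rcu_bh()/call_srcu() */
		rcu_barrier_bh();		/* 2. wait for each flavor actually used */
		srcu_barrier(&srcu_struct_1);
		srcu_barrier(&srcu_struct_2);
	}					/* 3. now safe to unload the module */
	module_exit(my_module_exit);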
@@ -92,14 +92,14 @@ If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set,
 more information is printed with the stall-warning message, for example:
 
 INFO: rcu_preempt detected stall on CPU
-0: (63959 ticks this GP) idle=241/3fffffffffffffff/0
+0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543
 (t=65000 jiffies)
 
 In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
 printed:
 
 INFO: rcu_preempt detected stall on CPU
-0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending
+0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
 (t=65000 jiffies)
 
 The "(64628 ticks this GP)" indicates that this CPU has taken more
@@ -116,13 +116,28 @@ number between the two "/"s is the value of the nesting, which will
 be a small positive number if in the idle loop and a very large positive
 number (as shown above) otherwise.
 
-For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is
-not in the process of trying to force itself into dyntick-idle state, the
-"." indicates that the CPU has not given up forcing RCU into dyntick-idle
-mode (it would be "H" otherwise), and the "timer not pending" indicates
-that the CPU has not recently forced RCU into dyntick-idle mode (it
-would otherwise indicate the number of microseconds remaining in this
-forced state).
+The "softirq=" portion of the message tracks the number of RCU softirq
+handlers that the stalled CPU has executed. The number before the "/"
+is the number that had executed since boot at the time that this CPU
+last noted the beginning of a grace period, which might be the current
+(stalled) grace period, or it might be some earlier grace period (for
+example, if the CPU might have been in dyntick-idle mode for an extended
+time period. The number after the "/" is the number that have executed
+since boot until the current time. If this latter number stays constant
+across repeated stall-warning messages, it is possible that RCU's softirq
+handlers are no longer able to execute on this CPU. This can happen if
+the stalled CPU is spinning with interrupts are disabled, or, in -rt
+kernels, if a high-priority process is starving RCU's softirq handler.
+
+For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the
+low-order 16 bits (in hex) of the jiffies counter when this CPU last
+invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked
+rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:"
+prints the number of non-lazy callbacks posted since the last call to
+rcu_needs_cpu(). Finally, an "L" indicates that there are currently
+no non-lazy callbacks ("." is printed otherwise, as shown above) and
+"D" indicates that dyntick-idle processing is enabled ("." is printed
+otherwise, for example, if disabled via the "nohz=" kernel boot parameter).
 
 
 Multiple Warnings From One Stall
@@ -265,9 +265,9 @@ rcu_dereference()
 rcu_read_lock();
 p = rcu_dereference(head.next);
 rcu_read_unlock();
-x = p->address;
+x = p->address; /* BUG!!! */
 rcu_read_lock();
-y = p->data;
+y = p->data; /* BUG!!! */
 rcu_read_unlock();
 
 Holding a reference from one RCU read-side critical section
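(Illustrative aside, not from the patch: the bug-free counterpart of the example above keeps the rcu_dereference() and every use of the resulting pointer inside a single read-side critical section.)

	rcu_read_lock();
	p = rcu_dereference(head.next);
	x = p->address;		/* OK: same critical section */
	y = p->data;		/* OK */
	rcu_read_unlock();
	/* p must not be dereferenced here unless a reference was taken above. */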
@@ -2484,9 +2484,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 In kernels built with CONFIG_RCU_NOCB_CPU=y, set
 the specified list of CPUs to be no-callback CPUs.
 Invocation of these CPUs' RCU callbacks will
-be offloaded to "rcuoN" kthreads created for
-that purpose. This reduces OS jitter on the
+be offloaded to "rcuox/N" kthreads created for
+that purpose, where "x" is "b" for RCU-bh, "p"
+for RCU-preempt, and "s" for RCU-sched, and "N"
+is the CPU number. This reduces OS jitter on the
 offloaded CPUs, which can be useful for HPC and
 real-time workloads. It can also improve energy
 efficiency for asymmetric multiprocessors.
 
@@ -2510,6 +2513,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 leaf rcu_node structure. Useful for very large
 systems.
 
+rcutree.jiffies_till_first_fqs= [KNL,BOOT]
+Set delay from grace-period initialization to
+first attempt to force quiescent states.
+Units are jiffies, minimum value is zero,
+and maximum value is HZ.
+
+rcutree.jiffies_till_next_fqs= [KNL,BOOT]
+Set delay between subsequent attempts to force
+quiescent states. Units are jiffies, minimum
+value is one, and maximum value is HZ.
+
 rcutree.qhimark= [KNL,BOOT]
 Set threshold of queued
 RCU callbacks over which batch limiting is disabled.
@@ -2524,16 +2538,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
 Set timeout for RCU CPU stall warning messages.
 
-rcutree.jiffies_till_first_fqs= [KNL,BOOT]
-Set delay from grace-period initialization to
-first attempt to force quiescent states.
-Units are jiffies, minimum value is zero,
-and maximum value is HZ.
+rcutree.rcu_idle_gp_delay= [KNL,BOOT]
+Set wakeup interval for idle CPUs that have
+RCU callbacks (RCU_FAST_NO_HZ=y).
 
-rcutree.jiffies_till_next_fqs= [KNL,BOOT]
-Set delay between subsequent attempts to force
-quiescent states. Units are jiffies, minimum
-value is one, and maximum value is HZ.
+rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT]
+Set wakeup interval for idle CPUs that have
+only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
+Lazy RCU callbacks are those which RCU can
+prove do nothing more than free memory.
 
 rcutorture.fqs_duration= [KNL,BOOT]
 Set duration of force_quiescent_state bursts.
Documentation/kernel-per-CPU-kthreads.txt (new file, 202 lines)
@@ -0,0 +1,202 @@
+REDUCING OS JITTER DUE TO PER-CPU KTHREADS
+
+This document lists per-CPU kthreads in the Linux kernel and presents
+options to control their OS jitter. Note that non-per-CPU kthreads are
+not listed here. To reduce OS jitter from non-per-CPU kthreads, bind
+them to a "housekeeping" CPU dedicated to such work.
+
+
+REFERENCES
+
+o Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs.
+
+o Documentation/cgroups: Using cgroups to bind tasks to sets of CPUs.
+
+o man taskset: Using the taskset command to bind tasks to sets
+of CPUs.
+
+o man sched_setaffinity: Using the sched_setaffinity() system
+call to bind tasks to sets of CPUs.
+
+o /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state,
+writing "0" to offline and "1" to online.
+
+o In order to locate kernel-generated OS jitter on CPU N:
+
+cd /sys/kernel/debug/tracing
+echo 1 > max_graph_depth # Increase the "1" for more detail
+echo function_graph > current_tracer
+# run workload
+cat per_cpu/cpuN/trace
+
+
+KTHREADS
+
+Name: ehca_comp/%u
+Purpose: Periodically process Infiniband-related work.
+To reduce its OS jitter, do any of the following:
+1. Don't use eHCA Infiniband hardware, instead choosing hardware
+that does not require per-CPU kthreads. This will prevent these
+kthreads from being created in the first place. (This will
+work for most people, as this hardware, though important, is
+relatively old and is produced in relatively low unit volumes.)
+2. Do all eHCA-Infiniband-related work on other CPUs, including
+interrupts.
+3. Rework the eHCA driver so that its per-CPU kthreads are
+provisioned only on selected CPUs.
+
+
+Name: irq/%d-%s
+Purpose: Handle threaded interrupts.
+To reduce its OS jitter, do the following:
+1. Use irq affinity to force the irq threads to execute on
+some other CPU.
+
+Name: kcmtpd_ctr_%d
+Purpose: Handle Bluetooth work.
+To reduce its OS jitter, do one of the following:
+1. Don't use Bluetooth, in which case these kthreads won't be
+created in the first place.
+2. Use irq affinity to force Bluetooth-related interrupts to
+occur on some other CPU and furthermore initiate all
+Bluetooth activity on some other CPU.
+
+Name: ksoftirqd/%u
+Purpose: Execute softirq handlers when threaded or when under heavy load.
+To reduce its OS jitter, each softirq vector must be handled
+separately as follows:
+TIMER_SOFTIRQ: Do all of the following:
+1. To the extent possible, keep the CPU out of the kernel when it
+is non-idle, for example, by avoiding system calls and by forcing
+both kernel threads and interrupts to execute elsewhere.
+2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force
+the CPU offline, then bring it back online. This forces
+recurring timers to migrate elsewhere. If you are concerned
+with multiple CPUs, force them all offline before bringing the
+first one back online. Once you have onlined the CPUs in question,
+do not offline any other CPUs, because doing so could force the
+timer back onto one of the CPUs in question.
+NET_TX_SOFTIRQ and NET_RX_SOFTIRQ: Do all of the following:
+1. Force networking interrupts onto other CPUs.
+2. Initiate any network I/O on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+from being initiated from tasks that might run on the CPU to
+be de-jittered. (It is OK to force this CPU offline and then
+bring it back online before you start your application.)
+BLOCK_SOFTIRQ: Do all of the following:
+1. Force block-device interrupts onto some other CPU.
+2. Initiate any block I/O on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+from being initiated from tasks that might run on the CPU to
+be de-jittered. (It is OK to force this CPU offline and then
+bring it back online before you start your application.)
+BLOCK_IOPOLL_SOFTIRQ: Do all of the following:
+1. Force block-device interrupts onto some other CPU.
+2. Initiate any block I/O and block-I/O polling on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+from being initiated from tasks that might run on the CPU to
+be de-jittered. (It is OK to force this CPU offline and then
+bring it back online before you start your application.)
+TASKLET_SOFTIRQ: Do one or more of the following:
+1. Avoid use of drivers that use tasklets. (Such drivers will contain
+calls to things like tasklet_schedule().)
+2. Convert all drivers that you must use from tasklets to workqueues.
+3. Force interrupts for drivers using tasklets onto other CPUs,
+and also do I/O involving these drivers on other CPUs.
+SCHED_SOFTIRQ: Do all of the following:
+1. Avoid sending scheduler IPIs to the CPU to be de-jittered,
+for example, ensure that at most one runnable kthread is present
+on that CPU. If a thread that expects to run on the de-jittered
+CPU awakens, the scheduler will send an IPI that can result in
+a subsequent SCHED_SOFTIRQ.
+2. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
+CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU
+to be de-jittered is marked as an adaptive-ticks CPU using the
+"nohz_full=" boot parameter. This reduces the number of
+scheduler-clock interrupts that the de-jittered CPU receives,
+minimizing its chances of being selected to do the load balancing
+work that runs in SCHED_SOFTIRQ context.
+3. To the extent possible, keep the CPU out of the kernel when it
+is non-idle, for example, by avoiding system calls and by
+forcing both kernel threads and interrupts to execute elsewhere.
+This further reduces the number of scheduler-clock interrupts
+received by the de-jittered CPU.
+HRTIMER_SOFTIRQ: Do all of the following:
+1. To the extent possible, keep the CPU out of the kernel when it
+is non-idle. For example, avoid system calls and force both
+kernel threads and interrupts to execute elsewhere.
+2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the
+CPU offline, then bring it back online. This forces recurring
+timers to migrate elsewhere. If you are concerned with multiple
+CPUs, force them all offline before bringing the first one
+back online. Once you have onlined the CPUs in question, do not
+offline any other CPUs, because doing so could force the timer
+back onto one of the CPUs in question.
+RCU_SOFTIRQ: Do at least one of the following:
+1. Offload callbacks and keep the CPU in either dyntick-idle or
+adaptive-ticks state by doing all of the following:
+a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
+CONFIG_NO_HZ_FULL=y, and, in addition ensure that the CPU
+to be de-jittered is marked as an adaptive-ticks CPU using
+the "nohz_full=" boot parameter. Bind the rcuo kthreads
+to housekeeping CPUs, which can tolerate OS jitter.
+b. To the extent possible, keep the CPU out of the kernel
+when it is non-idle, for example, by avoiding system
+calls and by forcing both kernel threads and interrupts
+to execute elsewhere.
+2. Enable RCU to do its processing remotely via dyntick-idle by
+doing all of the following:
+a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
+b. Ensure that the CPU goes idle frequently, allowing other
+CPUs to detect that it has passed through an RCU quiescent
+state. If the kernel is built with CONFIG_NO_HZ_FULL=y,
+userspace execution also allows other CPUs to detect that
+the CPU in question has passed through a quiescent state.
+c. To the extent possible, keep the CPU out of the kernel
+when it is non-idle, for example, by avoiding system
+calls and by forcing both kernel threads and interrupts
+to execute elsewhere.
+
+Name: rcuc/%u
+Purpose: Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
+To reduce its OS jitter, do at least one of the following:
+1. Build the kernel with CONFIG_PREEMPT=n. This prevents these
+kthreads from being created in the first place, and also obviates
+the need for RCU priority boosting. This approach is feasible
+for workloads that do not require high degrees of responsiveness.
+2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these
+kthreads from being created in the first place. This approach
+is feasible only if your workload never requires RCU priority
+boosting, for example, if you ensure frequent idle time on all
+CPUs that might execute within the kernel.
+3. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y,
+which offloads all RCU callbacks to kthreads that can be moved
+off of CPUs susceptible to OS jitter. This approach prevents the
+rcuc/%u kthreads from having any work to do, so that they are
+never awakened.
+4. Ensure that the CPU never enters the kernel, and, in particular,
+avoid initiating any CPU hotplug operations on this CPU. This is
+another way of preventing any callbacks from being queued on the
+CPU, again preventing the rcuc/%u kthreads from having any work
+to do.
+
+Name: rcuob/%d, rcuop/%d, and rcuos/%d
+Purpose: Offload RCU callbacks from the corresponding CPU.
+To reduce its OS jitter, do at least one of the following:
+1. Use affinity, cgroups, or other mechanism to force these kthreads
+to execute on some other CPU.
+2. Build with CONFIG_RCU_NOCB_CPUS=n, which will prevent these
+kthreads from being created in the first place. However, please
+note that this will not eliminate OS jitter, but will instead
+shift it to RCU_SOFTIRQ.
+
+Name: watchdog/%u
+Purpose: Detect software lockups on each CPU.
+To reduce its OS jitter, do at least one of the following:
+1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
+kthreads from being created in the first place.
+2. Echo a zero to /proc/sys/kernel/watchdog to disable the
+watchdog timer.
+3. Echo a large number of /proc/sys/kernel/watchdog_thresh in
+order to reduce the frequency of OS jitter due to the watchdog
+timer down to a level that is acceptable for your workload.
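(Illustrative aside, not part of the new document: one way to apply the sched_setaffinity() reference above, binding the current task to a hypothetical housekeeping CPU 0 so that it stays off the de-jittered CPUs.)

	#define _GNU_SOURCE
	#include <sched.h>

	static int bind_to_housekeeping_cpu(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(0, &set);	/* CPU 0 assumed to be the housekeeping CPU */
		return sched_setaffinity(0, sizeof(set), &set);	/* 0 == current task */
	}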
@@ -125,6 +125,11 @@ static inline void hlist_bl_unlock(struct hlist_bl_head *b)
 __bit_spin_unlock(0, (unsigned long *)b);
 }
 
+static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
+{
+return bit_spin_is_locked(0, (unsigned long *)b);
+}
+
 /**
 * hlist_bl_for_each_entry - iterate over list of given type
 * @tpos: the type * to use as a loop cursor.
@@ -20,7 +20,7 @@ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
 static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
 {
 return (struct hlist_bl_node *)
-((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK);
+((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
 }
 
 /**
@@ -80,6 +80,7 @@ extern void do_trace_rcu_torture_read(char *rcutorturename,
 #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
+#define ulong2long(a) (*(long *)(&(a)))
 
 /* Exported common interfaces */
 
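(Illustrative aside, not from the patch: ulong2long() reinterprets an unsigned long lvalue as signed, so a wrapped counter difference can be tested with an ordinary comparison.)

	/* Return true if counter value "cur" is logically behind "snap",
	 * even if the unsigned values have wrapped. */
	static bool counter_behind(unsigned long cur, unsigned long snap)
	{
		unsigned long delta = cur - snap;

		return ulong2long(delta) < 0;	/* delta must be an lvalue: its address is taken */
	}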
@@ -71,6 +71,58 @@ TRACE_EVENT(rcu_grace_period,
 __entry->rcuname, __entry->gpnum, __entry->gpevent)
 );
 
+/*
+* Tracepoint for future grace-period events, including those for no-callbacks
+* CPUs. The caller should pull the data from the rcu_node structure,
+* other than rcuname, which comes from the rcu_state structure, and event,
+* which is one of the following:
+*
+* "Startleaf": Request a nocb grace period based on leaf-node data.
+* "Startedleaf": Leaf-node start proved sufficient.
+* "Startedleafroot": Leaf-node start proved sufficient after checking root.
+* "Startedroot": Requested a nocb grace period based on root-node data.
+* "StartWait": Start waiting for the requested grace period.
+* "ResumeWait": Resume waiting after signal.
+* "EndWait": Complete wait.
+* "Cleanup": Clean up rcu_node structure after previous GP.
+* "CleanupMore": Clean up, and another no-CB GP is needed.
+*/
+TRACE_EVENT(rcu_future_grace_period,
+
+TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed,
+unsigned long c, u8 level, int grplo, int grphi,
+char *gpevent),
+
+TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent),
+
+TP_STRUCT__entry(
+__field(char *, rcuname)
+__field(unsigned long, gpnum)
+__field(unsigned long, completed)
+__field(unsigned long, c)
+__field(u8, level)
+__field(int, grplo)
+__field(int, grphi)
+__field(char *, gpevent)
+),
+
+TP_fast_assign(
+__entry->rcuname = rcuname;
+__entry->gpnum = gpnum;
+__entry->completed = completed;
+__entry->c = c;
+__entry->level = level;
+__entry->grplo = grplo;
+__entry->grphi = grphi;
+__entry->gpevent = gpevent;
+),
+
+TP_printk("%s %lu %lu %lu %u %d %d %s",
+__entry->rcuname, __entry->gpnum, __entry->completed,
+__entry->c, __entry->level, __entry->grplo, __entry->grphi,
+__entry->gpevent)
+);
+
 /*
 * Tracepoint for grace-period-initialization events. These are
 * distinguished by the type of RCU, the new grace-period number, the
@@ -601,6 +653,9 @@ TRACE_EVENT(rcu_barrier,
 #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0)
 #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \
 qsmask) do { } while (0)
+#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \
+level, grplo, grphi, event) \
+do { } while (0)
 #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
 #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
 #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \
init/Kconfig (71 lines changed)
@@ -578,13 +578,16 @@ config RCU_FAST_NO_HZ
 depends on NO_HZ && SMP
 default n
 help
-This option causes RCU to attempt to accelerate grace periods in
-order to allow CPUs to enter dynticks-idle state more quickly.
-On the other hand, this option increases the overhead of the
-dynticks-idle checking, thus degrading scheduling latency.
+This option permits CPUs to enter dynticks-idle state even if
+they have RCU callbacks queued, and prevents RCU from waking
+these CPUs up more than roughly once every four jiffies (by
+default, you can adjust this using the rcutree.rcu_idle_gp_delay
+parameter), thus improving energy efficiency. On the other
+hand, this option increases the duration of RCU grace periods,
+for example, slowing down synchronize_rcu().
 
-Say Y if energy efficiency is critically important, and you don't
-care about real-time response.
+Say Y if energy efficiency is critically important, and you
+don't care about increased grace-period durations.
 
 Say N if you are unsure.
 
@@ -651,7 +654,7 @@ config RCU_BOOST_DELAY
 Accept the default if unsure.
 
 config RCU_NOCB_CPU
-bool "Offload RCU callback processing from boot-selected CPUs"
+bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL"
 depends on TREE_RCU || TREE_PREEMPT_RCU
 default n
 help
@@ -662,16 +665,56 @@ config RCU_NOCB_CPU
 
 This option offloads callback invocation from the set of
 CPUs specified at boot time by the rcu_nocbs parameter.
-For each such CPU, a kthread ("rcuoN") will be created to
-invoke callbacks, where the "N" is the CPU being offloaded.
-Nothing prevents this kthread from running on the specified
-CPUs, but (1) the kthreads may be preempted between each
-callback, and (2) affinity or cgroups can be used to force
-the kthreads to run on whatever set of CPUs is desired.
+For each such CPU, a kthread ("rcuox/N") will be created to
+invoke callbacks, where the "N" is the CPU being offloaded,
+and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
+"s" for RCU-sched. Nothing prevents this kthread from running
+on the specified CPUs, but (1) the kthreads may be preempted
+between each callback, and (2) affinity or cgroups can be used
+to force the kthreads to run on whatever set of CPUs is desired.
 
-Say Y here if you want reduced OS jitter on selected CPUs.
+Say Y here if you want to help to debug reduced OS jitter.
 Say N here if you are unsure.
 
+choice
+prompt "Build-forced no-CBs CPUs"
+default RCU_NOCB_CPU_NONE
+help
+This option allows no-CBs CPUs to be specified at build time.
+Additional no-CBs CPUs may be specified by the rcu_nocbs=
+boot parameter.
+
+config RCU_NOCB_CPU_NONE
+bool "No build_forced no-CBs CPUs"
+depends on RCU_NOCB_CPU
+help
+This option does not force any of the CPUs to be no-CBs CPUs.
+Only CPUs designated by the rcu_nocbs= boot parameter will be
+no-CBs CPUs.
+
+config RCU_NOCB_CPU_ZERO
+bool "CPU 0 is a build_forced no-CBs CPU"
+depends on RCU_NOCB_CPU
+help
+This option forces CPU 0 to be a no-CBs CPU. Additional CPUs
+may be designated as no-CBs CPUs using the rcu_nocbs= boot
+parameter will be no-CBs CPUs.
+
+Select this if CPU 0 needs to be a no-CBs CPU for real-time
+or energy-efficiency reasons.
+
+config RCU_NOCB_CPU_ALL
+bool "All CPUs are build_forced no-CBs CPUs"
+depends on RCU_NOCB_CPU
+help
+This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs=
+boot parameter will be ignored.
+
+Select this if all CPUs need to be no-CBs CPUs for real-time
+or energy-efficiency reasons.
+
+endchoice
+
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
kernel/rcutree.c (260 lines changed)
@@ -64,7 +64,7 @@
 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
 
-#define RCU_STATE_INITIALIZER(sname, cr) { \
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
 .level = { &sname##_state.node[0] }, \
 .call = cr, \
 .fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
 .name = #sname, \
+.abbr = sabbr, \
 }
 
 struct rcu_state rcu_sched_state =
-RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
+RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
 module_param(jiffies_till_first_fqs, ulong, 0644);
 module_param(jiffies_till_next_fqs, ulong, 0644);
 
+static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+struct rcu_data *rdp);
 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
 static void force_quiescent_state(struct rcu_state *rsp);
 static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 
 if (rcu_gp_in_progress(rsp))
 return 0; /* No, a grace period is already in progress. */
+if (rcu_nocb_needs_gp(rsp))
+return 1; /* Yes, a no-CBs CPU needs one. */
 if (!rdp->nxttail[RCU_NEXT_TAIL])
 return 0; /* No, this is a no-CBs (or offline) CPU. */
 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -1035,10 +1040,11 @@ static void init_callback_list(struct rcu_data *rdp)
 {
 int i;
 
+if (init_nocb_callback_list(rdp))
+return;
 rdp->nxtlist = NULL;
 for (i = 0; i < RCU_NEXT_SIZE; i++)
 rdp->nxttail[i] = &rdp->nxtlist;
-init_nocb_callback_list(rdp);
 }
 
 /*
@@ -1070,6 +1076,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
 return rnp->completed + 2;
 }
 
+/*
+* Trace-event helper function for rcu_start_future_gp() and
+* rcu_nocb_wait_gp().
+*/
+static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+unsigned long c, char *s)
+{
+trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
+rnp->completed, c, rnp->level,
+rnp->grplo, rnp->grphi, s);
+}
+
+/*
+* Start some future grace period, as needed to handle newly arrived
+* callbacks. The required future grace periods are recorded in each
+* rcu_node structure's ->need_future_gp field.
+*
+* The caller must hold the specified rcu_node structure's ->lock.
+*/
+static unsigned long __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+unsigned long c;
+int i;
+struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+/*
+* Pick up grace-period number for new callbacks. If this
+* grace period is already marked as needed, return to the caller.
+*/
+c = rcu_cbs_completed(rdp->rsp, rnp);
+trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
+if (rnp->need_future_gp[c & 0x1]) {
+trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
+return c;
+}
+
+/*
+* If either this rcu_node structure or the root rcu_node structure
+* believe that a grace period is in progress, then we must wait
+* for the one following, which is in "c". Because our request
+* will be noticed at the end of the current grace period, we don't
+* need to explicitly start one.
+*/
+if (rnp->gpnum != rnp->completed ||
+ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+rnp->need_future_gp[c & 0x1]++;
+trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
+return c;
+}
+
+/*
+* There might be no grace period in progress. If we don't already
+* hold it, acquire the root rcu_node structure's lock in order to
+* start one (if needed).
+*/
+if (rnp != rnp_root)
+raw_spin_lock(&rnp_root->lock);
+
+/*
+* Get a new grace-period number. If there really is no grace
+* period in progress, it will be smaller than the one we obtained
+* earlier. Adjust callbacks as needed. Note that even no-CBs
+* CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+*/
+c = rcu_cbs_completed(rdp->rsp, rnp_root);
+for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
+if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
+rdp->nxtcompleted[i] = c;
+
+/*
+* If the needed for the required grace period is already
+* recorded, trace and leave.
+*/
+if (rnp_root->need_future_gp[c & 0x1]) {
+trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
+goto unlock_out;
+}
+
+/* Record the need for the future grace period. */
+rnp_root->need_future_gp[c & 0x1]++;
+
+/* If a grace period is not already in progress, start one. */
+if (rnp_root->gpnum != rnp_root->completed) {
+trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
+} else {
+trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
+rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+}
+unlock_out:
+if (rnp != rnp_root)
+raw_spin_unlock(&rnp_root->lock);
+return c;
+}
+
+/*
+* Clean up any old requests for the just-ended grace period. Also return
+* whether any additional grace periods have been requested. Also invoke
+* rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
+* waiting for this grace period to complete.
+*/
+static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+int c = rnp->completed;
+int needmore;
+struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+
+rcu_nocb_gp_cleanup(rsp, rnp);
+rnp->need_future_gp[c & 0x1] = 0;
+needmore = rnp->need_future_gp[(c + 1) & 0x1];
+trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
+return needmore;
+}
+
 /*
 * If there is room, assign a ->completed number to any callbacks on
 * this CPU that have not already been assigned. Also accelerate any
@@ -1129,6 +1249,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
 rdp->nxtcompleted[i] = c;
 }
+/* Record any needed additional grace periods. */
+rcu_start_future_gp(rnp, rdp);
 
 /* Trace depending on how much we were able to accelerate. */
 if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@ -1308,9 +1430,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
 rdp = this_cpu_ptr(rsp->rda);
 rcu_preempt_check_blocked_tasks(rnp);
 rnp->qsmask = rnp->qsmaskinit;
-rnp->gpnum = rsp->gpnum;
+ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
 WARN_ON_ONCE(rnp->completed != rsp->completed);
-rnp->completed = rsp->completed;
+ACCESS_ONCE(rnp->completed) = rsp->completed;
 if (rnp == rdp->mynode)
 rcu_start_gp_per_cpu(rsp, rnp, rdp);
 rcu_preempt_boost_start_gp(rnp);
@@ -1319,7 +1441,8 @@
 rnp->grphi, rnp->qsmask);
 raw_spin_unlock_irq(&rnp->lock);
 #ifdef CONFIG_PROVE_RCU_DELAY
-if ((random32() % (rcu_num_nodes * 8)) == 0)
+if ((random32() % (rcu_num_nodes * 8)) == 0 &&
+system_state == SYSTEM_RUNNING)
 schedule_timeout_uninterruptible(2);
 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
 cond_resched();
@@ -1361,6 +1484,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 static void rcu_gp_cleanup(struct rcu_state *rsp)
 {
 unsigned long gp_duration;
+int nocb = 0;
 struct rcu_data *rdp;
 struct rcu_node *rnp = rcu_get_root(rsp);
 
@@ -1390,17 +1514,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 */
 rcu_for_each_node_breadth_first(rsp, rnp) {
 raw_spin_lock_irq(&rnp->lock);
-rnp->completed = rsp->gpnum;
+ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+rdp = this_cpu_ptr(rsp->rda);
+if (rnp == rdp->mynode)
+__rcu_process_gp_end(rsp, rnp, rdp);
+nocb += rcu_future_gp_cleanup(rsp, rnp);
 raw_spin_unlock_irq(&rnp->lock);
 cond_resched();
 }
 rnp = rcu_get_root(rsp);
 raw_spin_lock_irq(&rnp->lock);
+rcu_nocb_gp_set(rnp, nocb);
 
 rsp->completed = rsp->gpnum; /* Declare grace period done. */
 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
 rsp->fqs_state = RCU_GP_IDLE;
 rdp = this_cpu_ptr(rsp->rda);
+rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
 if (cpu_needs_another_gp(rsp, rdp))
 rsp->gp_flags = 1;
 raw_spin_unlock_irq(&rnp->lock);
@@ -1476,57 +1606,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
 /*
 * Start a new RCU grace period if warranted, re-initializing the hierarchy
 * in preparation for detecting the next grace period. The caller must hold
-* the root node's ->lock, which is released before return. Hard irqs must
-* be disabled.
+* the root node's ->lock and hard irqs must be disabled.
 *
 * Note that it is legal for a dying CPU (which is marked as offline) to
 * invoke this function. This can happen when the dying CPU reports its
 * quiescent state.
 */
 static void
-rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
-__releases(rcu_get_root(rsp)->lock)
+rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+struct rcu_data *rdp)
 {
-struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
-struct rcu_node *rnp = rcu_get_root(rsp);
-
-if (!rsp->gp_kthread ||
-!cpu_needs_another_gp(rsp, rdp)) {
+if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
 /*
 * Either we have not yet spawned the grace-period
 * task, this CPU does not need another grace period,
 * or a grace period is already in progress.
 * Either way, don't start a new grace period.
 */
-raw_spin_unlock_irqrestore(&rnp->lock, flags);
 return;
 }
 
-/*
-* Because there is no grace period in progress right now,
-* any callbacks we have up to this point will be satisfied
-* by the next grace period. So this is a good place to
-* assign a grace period number to recently posted callbacks.
-*/
-rcu_accelerate_cbs(rsp, rnp, rdp);
-
 rsp->gp_flags = RCU_GP_FLAG_INIT;
-raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
-
-/* Ensure that CPU is aware of completion of last grace period. */
-rcu_process_gp_end(rsp, rdp);
-local_irq_restore(flags);
-
 /* Wake up rcu_gp_kthread() to start the grace period. */
 wake_up(&rsp->gp_wq);
 }
 
+/*
+* Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
+* callbacks. Note that rcu_start_gp_advanced() cannot do this because it
+* is invoked indirectly from rcu_advance_cbs(), which would result in
+* endless recursion -- or would do so if it wasn't for the self-deadlock
+* that is encountered beforehand.
+*/
+static void
+rcu_start_gp(struct rcu_state *rsp)
+{
+struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+struct rcu_node *rnp = rcu_get_root(rsp);
+
+/*
+* If there is no grace period in progress right now, any
+* callbacks we have up to this point will be satisfied by the
+* next grace period. Also, advancing the callbacks reduces the
+* probability of false positives from cpu_needs_another_gp()
+* resulting in pointless grace periods. So, advance callbacks
+* then start the grace period!
+*/
+rcu_advance_cbs(rsp, rnp, rdp);
+rcu_start_gp_advanced(rsp, rnp, rdp);
+}
+
 /*
 * Report a full set of quiescent states to the specified rcu_state
 * data structure. This involves cleaning up after the prior grace
 * period and letting rcu_start_gp() start up the next grace period
-* if one is needed. Note that the caller must hold rnp->lock, as
-* required by rcu_start_gp(), which will release it.
+* if one is needed. Note that the caller must hold rnp->lock, which
+* is released before return.
 */
 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 __releases(rcu_get_root(rsp)->lock)
@ -2124,7 +2259,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)
|
|||||||
local_irq_save(flags);
|
local_irq_save(flags);
|
||||||
if (cpu_needs_another_gp(rsp, rdp)) {
|
if (cpu_needs_another_gp(rsp, rdp)) {
|
||||||
raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
|
raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
|
||||||
rcu_start_gp(rsp, flags); /* releases above lock */
|
rcu_start_gp(rsp);
|
||||||
|
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
|
||||||
} else {
|
} else {
|
||||||
local_irq_restore(flags);
|
local_irq_restore(flags);
|
||||||
}
|
}
|
||||||
@ -2169,7 +2305,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
|
|||||||
|
|
||||||
static void invoke_rcu_core(void)
|
static void invoke_rcu_core(void)
|
||||||
{
|
{
|
||||||
raise_softirq(RCU_SOFTIRQ);
|
if (cpu_online(smp_processor_id()))
|
||||||
|
raise_softirq(RCU_SOFTIRQ);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2204,11 +2341,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
|
|||||||
|
|
||||||
/* Start a new grace period if one not already started. */
|
/* Start a new grace period if one not already started. */
|
||||||
if (!rcu_gp_in_progress(rsp)) {
|
if (!rcu_gp_in_progress(rsp)) {
|
||||||
unsigned long nestflag;
|
|
||||||
struct rcu_node *rnp_root = rcu_get_root(rsp);
|
struct rcu_node *rnp_root = rcu_get_root(rsp);
|
||||||
|
|
||||||
raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
|
raw_spin_lock(&rnp_root->lock);
|
||||||
rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
|
rcu_start_gp(rsp);
|
||||||
|
raw_spin_unlock(&rnp_root->lock);
|
||||||
} else {
|
} else {
|
||||||
/* Give the grace period a kick. */
|
/* Give the grace period a kick. */
|
||||||
rdp->blimit = LONG_MAX;
|
rdp->blimit = LONG_MAX;
|
||||||
@ -2628,19 +2765,27 @@ static int rcu_pending(int cpu)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check to see if any future RCU-related work will need to be done
|
* Return true if the specified CPU has any callback. If all_lazy is
|
||||||
* by the current CPU, even if none need be done immediately, returning
|
* non-NULL, store an indication of whether all callbacks are lazy.
|
||||||
* 1 if so.
|
* (If there are no callbacks, all of them are deemed to be lazy.)
|
||||||
*/
|
*/
|
||||||
static int rcu_cpu_has_callbacks(int cpu)
|
static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
|
||||||
{
|
{
|
||||||
|
bool al = true;
|
||||||
|
bool hc = false;
|
||||||
|
struct rcu_data *rdp;
|
||||||
struct rcu_state *rsp;
|
struct rcu_state *rsp;
|
||||||
|
|
||||||
/* RCU callbacks either ready or pending? */
|
for_each_rcu_flavor(rsp) {
|
||||||
for_each_rcu_flavor(rsp)
|
rdp = per_cpu_ptr(rsp->rda, cpu);
|
||||||
if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
|
if (rdp->qlen != rdp->qlen_lazy)
|
||||||
return 1;
|
al = false;
|
||||||
return 0;
|
if (rdp->nxtlist)
|
||||||
|
hc = true;
|
||||||
|
}
|
||||||
|
if (all_lazy)
|
||||||
|
*all_lazy = al;
|
||||||
|
return hc;
|
||||||
}
|
}
|
||||||
|
|
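The rewritten rcu_cpu_has_callbacks() above answers two questions in a single pass over the RCU flavors: does this CPU have any callbacks queued at all, and are all of the queued callbacks lazy (kfree-style)?  The following user-space sketch models only that aggregation pattern; struct flavor, its fields, and cpu_has_callbacks() are illustrative stand-ins rather than kernel code, and the kernel tests rdp->nxtlist rather than a length for the has-callbacks side.

#include <stdbool.h>
#include <stdio.h>

struct flavor {
	long qlen;	/* total queued callbacks */
	long qlen_lazy;	/* of which lazy (kfree-style) */
};

/* One pass, two answers, mirroring the aggregation in rcu_cpu_has_callbacks(). */
static bool cpu_has_callbacks(const struct flavor *f, int n, bool *all_lazy)
{
	bool al = true;
	bool hc = false;

	for (int i = 0; i < n; i++) {
		if (f[i].qlen != f[i].qlen_lazy)
			al = false;	/* at least one non-lazy callback */
		if (f[i].qlen)
			hc = true;	/* at least one callback of any kind */
	}
	if (all_lazy)
		*all_lazy = al;
	return hc;
}

int main(void)
{
	struct flavor flavors[] = { { 4, 4 }, { 0, 0 }, { 2, 1 } };
	bool lazy;
	bool has = cpu_has_callbacks(flavors, 3, &lazy);

	printf("has callbacks: %d, all lazy: %d\n", has, lazy);
	return 0;
}
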
 /*
@@ -2859,7 +3004,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 	atomic_set(&rdp->dynticks->dynticks,
 		   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
-	rcu_prepare_for_idle_init(cpu);
 	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
 
 	/* Add CPU to rcu_node bitmasks. */
@@ -2909,7 +3053,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;
 	struct rcu_state *rsp;
-	int ret = NOTIFY_OK;
 
 	trace_rcu_utilization("Start CPU hotplug");
 	switch (action) {
@@ -2923,21 +3066,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 		rcu_boost_kthread_setaffinity(rnp, -1);
 		break;
 	case CPU_DOWN_PREPARE:
-		if (nocb_cpu_expendable(cpu))
-			rcu_boost_kthread_setaffinity(rnp, cpu);
-		else
-			ret = NOTIFY_BAD;
+		rcu_boost_kthread_setaffinity(rnp, cpu);
 		break;
 	case CPU_DYING:
 	case CPU_DYING_FROZEN:
-		/*
-		 * The whole machine is "stopped" except this CPU, so we can
-		 * touch any data without introducing corruption. We send the
-		 * dying CPU's callbacks to an arbitrarily chosen online CPU.
-		 */
 		for_each_rcu_flavor(rsp)
 			rcu_cleanup_dying_cpu(rsp);
-		rcu_cleanup_after_idle(cpu);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
@@ -2950,7 +3084,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 		break;
 	}
 	trace_rcu_utilization("End CPU hotplug");
-	return ret;
+	return NOTIFY_OK;
 }
 
 /*
@@ -3085,6 +3219,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 			}
 			rnp->level = i;
 			INIT_LIST_HEAD(&rnp->blkd_tasks);
+			rcu_init_one_nocb(rnp);
 		}
 	}
 
@@ -3170,8 +3305,7 @@ void __init rcu_init(void)
 	rcu_init_one(&rcu_sched_state, &rcu_sched_data);
 	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
 	__rcu_init_preempt();
-	rcu_init_nocb();
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
 	/*
 	 * We don't need protection against CPU-hotplug here because
@@ -88,18 +88,13 @@ struct rcu_dynticks {
 	int dynticks_nmi_nesting;   /* Track NMI nesting level. */
 	atomic_t dynticks;	    /* Even value for idle, else odd. */
 #ifdef CONFIG_RCU_FAST_NO_HZ
-	int dyntick_drain;	    /* Prepare-for-idle state variable. */
-	unsigned long dyntick_holdoff;
-				    /* No retries for the jiffy of failure. */
-	struct timer_list idle_gp_timer;
-				    /* Wake up CPU sleeping with callbacks. */
-	unsigned long idle_gp_timer_expires;
-				    /* When to wake up CPU (for repost). */
-	bool idle_first_pass;	    /* First pass of attempt to go idle? */
+	bool all_lazy;		    /* Are all CPU's CBs lazy? */
 	unsigned long nonlazy_posted;
 				    /* # times non-lazy CBs posted to CPU. */
 	unsigned long nonlazy_posted_snap;
 				    /* idle-period nonlazy_posted snapshot. */
+	unsigned long last_accelerate;
+				    /* Last jiffy CBs were accelerated. */
 	int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
@@ -134,9 +129,6 @@ struct rcu_node {
 				/*  elements that need to drain to allow the */
 				/*  current expedited grace period to */
 				/*  complete (only for TREE_PREEMPT_RCU). */
-	atomic_t wakemask;	/* CPUs whose kthread needs to be awakened. */
-				/*  Since this has meaning only for leaf */
-				/*  rcu_node structures, 32 bits suffices. */
 	unsigned long qsmaskinit;
 				/* Per-GP initial value for qsmask & expmask. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
@@ -196,6 +188,12 @@ struct rcu_node {
 				/*  Refused to boost:  not sure why, though. */
 				/*  This can happen due to race conditions. */
 #endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_NOCB_CPU
+	wait_queue_head_t nocb_gp_wq[2];
+				/* Place for rcu_nocb_kthread() to wait GP. */
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+	int need_future_gp[2];
+				/* Counts of upcoming no-CB GP requests. */
 	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
 } ____cacheline_internodealigned_in_smp;
 
@@ -328,6 +326,11 @@ struct rcu_data {
 	struct task_struct *nocb_kthread;
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 
+	/* 8) RCU CPU stall data. */
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+	unsigned int softirq_snap;	/* Snapshot of softirq activity. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
 	int cpu;
 	struct rcu_state *rsp;
 };
@@ -375,12 +378,6 @@ struct rcu_state {
 	struct rcu_data __percpu *rda;		/* pointer of percu rcu_data. */
 	void (*call)(struct rcu_head *head,	/* call_rcu() flavor. */
 		     void (*func)(struct rcu_head *head));
-#ifdef CONFIG_RCU_NOCB_CPU
-	void (*call_remote)(struct rcu_head *head,
-		     void (*func)(struct rcu_head *head));
-						/* call_rcu() flavor, but for */
-						/*  placing on remote CPU. */
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 
 	/* The following fields are guarded by the root rcu_node's lock. */
 
@@ -443,6 +440,7 @@ struct rcu_state {
 	unsigned long gp_max;			/* Maximum GP duration in */
 						/*  jiffies. */
 	char *name;				/* Name of structure. */
+	char abbr;				/* Abbreviated name. */
 	struct list_head flavors;		/* List of RCU flavors. */
 };
 
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 						 struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 static void __cpuinit rcu_prepare_kthreads(int cpu);
-static void rcu_prepare_for_idle_init(int cpu);
 static void rcu_cleanup_after_idle(int cpu);
 static void rcu_prepare_for_idle(int cpu);
 static void rcu_idle_count_callbacks_posted(void);
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
 static void print_cpu_stall_info_end(void);
 static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static void increment_cpu_stall_ticks(void);
+static int rcu_nocb_needs_gp(struct rcu_state *rsp);
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool is_nocb_cpu(int cpu);
 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 			    bool lazy);
 static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
 				      struct rcu_data *rdp);
-static bool nocb_cpu_expendable(int cpu);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
 static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void init_nocb_callback_list(struct rcu_data *rdp);
-static void __init rcu_init_nocb(void);
+static bool init_nocb_callback_list(struct rcu_data *rdp);
 
 #endif /* #ifndef RCU_TREE_NONCORE */
 
@@ -85,11 +85,21 @@ static void __init rcu_bootup_announce_oddness(void)
 	if (nr_cpu_ids != NR_CPUS)
 		printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
 #ifdef CONFIG_RCU_NOCB_CPU
+#ifndef CONFIG_RCU_NOCB_CPU_NONE
+	if (!have_rcu_nocb_mask) {
+		alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+		have_rcu_nocb_mask = true;
+	}
+#ifdef CONFIG_RCU_NOCB_CPU_ZERO
+	pr_info("\tExperimental no-CBs CPU 0\n");
+	cpumask_set_cpu(0, rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
+#ifdef CONFIG_RCU_NOCB_CPU_ALL
+	pr_info("\tExperimental no-CBs for all CPUs\n");
+	cpumask_setall(rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
 	if (have_rcu_nocb_mask) {
-		if (cpumask_test_cpu(0, rcu_nocb_mask)) {
-			cpumask_clear_cpu(0, rcu_nocb_mask);
-			pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
-		}
 		cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
 		pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
 		if (rcu_nocb_poll)
@@ -101,7 +111,7 @@ static void __init rcu_bootup_announce_oddness(void)
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
 struct rcu_state rcu_preempt_state =
-	RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
+	RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
 static struct rcu_state *rcu_state = &rcu_preempt_state;
 
@@ -1533,14 +1543,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 {
 	*delta_jiffies = ULONG_MAX;
-	return rcu_cpu_has_callbacks(cpu);
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
- */
-static void rcu_prepare_for_idle_init(int cpu)
-{
+	return rcu_cpu_has_callbacks(cpu, NULL);
 }
 
 /*
@@ -1577,16 +1580,6 @@ static void rcu_idle_count_callbacks_posted(void)
  *
  * The following three proprocessor symbols control this state machine:
  *
- * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
- *	to satisfy RCU.  Beyond this point, it is better to incur a periodic
- *	scheduling-clock interrupt than to loop through the state machine
- *	at full power.
- * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
- *	optional if RCU does not need anything immediately from this
- *	CPU, even if this CPU still has RCU callbacks queued.  The first
- *	times through the state machine are mandatory: we need to give
- *	the state machine a chance to communicate a quiescent state
- *	to the RCU core.
  * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
  *	to sleep in dyntick-idle mode with RCU callbacks pending.  This
  *	is sized to be roughly one RCU grace period.  Those energy-efficiency
@@ -1602,186 +1595,108 @@ static void rcu_idle_count_callbacks_posted(void)
  * adjustment, they can be converted into kernel config parameters, though
  * making the state machine smarter might be a better option.
  */
-#define RCU_IDLE_FLUSHES 5		/* Number of dyntick-idle tries. */
-#define RCU_IDLE_OPT_FLUSHES 3		/* Optional dyntick-idle tries. */
 #define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */
 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */
 
+static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
+module_param(rcu_idle_gp_delay, int, 0644);
+static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
+module_param(rcu_idle_lazy_gp_delay, int, 0644);
+
 extern int tick_nohz_enabled;
 
 /*
- * Does the specified flavor of RCU have non-lazy callbacks pending on
- * the specified CPU?  Both RCU flavor and CPU are specified by the
- * rcu_data structure.
- */
-static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
-{
-	return rdp->qlen != rdp->qlen_lazy;
-}
-
-#ifdef CONFIG_TREE_PREEMPT_RCU
-
-/*
- * Are there non-lazy RCU-preempt callbacks?  (There cannot be if there
- * is no RCU-preempt in the kernel.)
- */
-static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-
-	return __rcu_cpu_has_nonlazy_callbacks(rdp);
-}
-
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-
-static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
-{
-	return 0;
-}
-
-#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
-
-/*
- * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
- */
-static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
-{
-	return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
-	       __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
-	       rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
+ * Try to advance callbacks for all flavors of RCU on the current CPU.
+ * Afterwards, if there are any callbacks ready for immediate invocation,
+ * return true.
+ */
+static bool rcu_try_advance_all_cbs(void)
+{
+	bool cbs_ready = false;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp) {
+		rdp = this_cpu_ptr(rsp->rda);
+		rnp = rdp->mynode;
+
+		/*
+		 * Don't bother checking unless a grace period has
+		 * completed since we last checked and there are
+		 * callbacks not yet ready to invoke.
+		 */
+		if (rdp->completed != rnp->completed &&
+		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
+			rcu_process_gp_end(rsp, rdp);
+
+		if (cpu_has_callbacks_ready_to_invoke(rdp))
+			cbs_ready = true;
+	}
+	return cbs_ready;
 }
 
 /*
- * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
- * callbacks on this CPU, (2) this CPU has not yet attempted to enter
- * dyntick-idle mode, or (3) this CPU is in the process of attempting to
- * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
- * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
- * it is better to incur scheduling-clock interrupts than to spin
- * continuously for the same time duration!
- *
- * The delta_jiffies argument is used to store the time when RCU is
- * going to need the CPU again if it still has callbacks.  The reason
- * for this is that rcu_prepare_for_idle() might need to post a timer,
- * but if so, it will do so after tick_nohz_stop_sched_tick() has set
- * the wakeup time for this CPU.  This means that RCU's timer can be
- * delayed until the wakeup time, which defeats the purpose of posting
- * a timer.
+ * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
+ * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
+ * caller to set the timeout based on whether or not there are non-lazy
+ * callbacks.
+ *
+ * The caller must have disabled interrupts.
  */
-int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+int rcu_needs_cpu(int cpu, unsigned long *dj)
 {
 	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
-	/* Flag a new idle sojourn to the idle-entry state machine. */
-	rdtp->idle_first_pass = 1;
+	/* Snapshot to detect later posting of non-lazy callback. */
+	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
 
 	/* If no callbacks, RCU doesn't need the CPU. */
-	if (!rcu_cpu_has_callbacks(cpu)) {
-		*delta_jiffies = ULONG_MAX;
+	if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+		*dj = ULONG_MAX;
 		return 0;
 	}
-	if (rdtp->dyntick_holdoff == jiffies) {
-		/* RCU recently tried and failed, so don't try again. */
-		*delta_jiffies = 1;
+
+	/* Attempt to advance callbacks. */
+	if (rcu_try_advance_all_cbs()) {
+		/* Some ready to invoke, so initiate later invocation. */
+		invoke_rcu_core();
 		return 1;
 	}
-	/* Set up for the possibility that RCU will post a timer. */
-	if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
-		*delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
-					  RCU_IDLE_GP_DELAY) - jiffies;
+	rdtp->last_accelerate = jiffies;
+
+	/* Request timer delay depending on laziness, and round. */
+	if (!rdtp->all_lazy) {
+		*dj = round_up(rcu_idle_gp_delay + jiffies,
+			       rcu_idle_gp_delay) - jiffies;
 	} else {
-		*delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
-		*delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
+		*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
 	}
 	return 0;
 }
 
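rcu_needs_cpu() now tells the tick layer how long the CPU may sleep: roughly one grace period when non-lazy callbacks are pending, and on the order of seconds when only lazy (kfree-style) callbacks are queued, with the result rounded so that wakeups from different CPUs batch together. Below is a small stand-alone model of that selection; it assumes HZ=250 and uses a plain round-up in place of the kernel's round_jiffies(), so it is a sketch of the policy rather than of the kernel helpers.

#include <stdio.h>

#define GP_DELAY	4		/* roughly one grace period, in ticks */
#define LAZY_GP_DELAY	(6 * 250)	/* roughly six seconds at the assumed HZ=250 */

/* Round v up to the next multiple of align (stand-in for the kernel's round_up()). */
static unsigned long round_up_to(unsigned long v, unsigned long align)
{
	return ((v + align - 1) / align) * align;
}

/*
 * Model of the timeout selection: a CPU whose pending callbacks are all
 * lazy may sleep for seconds, while a CPU with non-lazy callbacks is told
 * to wake after roughly one grace period.
 */
static unsigned long idle_wake_delay(unsigned long now, int all_lazy)
{
	if (!all_lazy)
		return round_up_to(now + GP_DELAY, GP_DELAY) - now;
	return round_up_to(now + LAZY_GP_DELAY, LAZY_GP_DELAY) - now;
}

int main(void)
{
	printf("busy CPU sleeps %lu ticks, lazy-only CPU sleeps %lu ticks\n",
	       idle_wake_delay(1000, 0), idle_wake_delay(1000, 1));
	return 0;
}
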
 /*
- * Handler for smp_call_function_single().  The only point of this
- * handler is to wake the CPU up, so the handler does only tracing.
- */
-void rcu_idle_demigrate(void *unused)
-{
-	trace_rcu_prep_idle("Demigrate");
-}
-
-/*
- * Timer handler used to force CPU to start pushing its remaining RCU
- * callbacks in the case where it entered dyntick-idle mode with callbacks
- * pending.  The hander doesn't really need to do anything because the
- * real work is done upon re-entry to idle, or by the next scheduling-clock
- * interrupt should idle not be re-entered.
- *
- * One special case: the timer gets migrated without awakening the CPU
- * on which the timer was scheduled on.  In this case, we must wake up
- * that CPU.  We do so with smp_call_function_single().
- */
-static void rcu_idle_gp_timer_func(unsigned long cpu_in)
-{
-	int cpu = (int)cpu_in;
-
-	trace_rcu_prep_idle("Timer");
-	if (cpu != smp_processor_id())
-		smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
-	else
-		WARN_ON_ONCE(1); /* Getting here can hang the system... */
-}
-
-/*
- * Initialize the timer used to pull CPUs out of dyntick-idle mode.
- */
-static void rcu_prepare_for_idle_init(int cpu)
-{
-	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-	rdtp->dyntick_holdoff = jiffies - 1;
-	setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
-	rdtp->idle_gp_timer_expires = jiffies - 1;
-	rdtp->idle_first_pass = 1;
-}
-
-/*
- * Clean up for exit from idle.  Because we are exiting from idle, there
- * is no longer any point to ->idle_gp_timer, so cancel it.  This will
- * do nothing if this timer is not active, so just cancel it unconditionally.
- */
-static void rcu_cleanup_after_idle(int cpu)
-{
-	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-	del_timer(&rdtp->idle_gp_timer);
-	trace_rcu_prep_idle("Cleanup after idle");
-	rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
-}
-
-/*
- * Check to see if any RCU-related work can be done by the current CPU,
- * and if so, schedule a softirq to get it done.  This function is part
- * of the RCU implementation; it is -not- an exported member of the RCU API.
- *
- * The idea is for the current CPU to clear out all work required by the
- * RCU core for the current grace period, so that this CPU can be permitted
- * to enter dyntick-idle mode.  In some cases, it will need to be awakened
- * at the end of the grace period by whatever CPU ends the grace period.
- * This allows CPUs to go dyntick-idle more quickly, and to reduce the
- * number of wakeups by a modest integer factor.
- *
- * Because it is not legal to invoke rcu_process_callbacks() with irqs
- * disabled, we do one pass of force_quiescent_state(), then do a
- * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
- * later.  The ->dyntick_drain field controls the sequencing.
+ * Prepare a CPU for idle from an RCU perspective.  The first major task
+ * is to sense whether nohz mode has been enabled or disabled via sysfs.
+ * The second major task is to check to see if a non-lazy callback has
+ * arrived at a CPU that previously had only lazy callbacks.  The third
+ * major task is to accelerate (that is, assign grace-period numbers to)
+ * any recently arrived callbacks.
  *
  * The caller must have disabled interrupts.
  */
 static void rcu_prepare_for_idle(int cpu)
 {
-	struct timer_list *tp;
+	struct rcu_data *rdp;
 	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+	struct rcu_node *rnp;
+	struct rcu_state *rsp;
 	int tne;
 
 	/* Handle nohz enablement switches conservatively. */
 	tne = ACCESS_ONCE(tick_nohz_enabled);
 	if (tne != rdtp->tick_nohz_enabled_snap) {
-		if (rcu_cpu_has_callbacks(cpu))
+		if (rcu_cpu_has_callbacks(cpu, NULL))
 			invoke_rcu_core(); /* force nohz to see update. */
 		rdtp->tick_nohz_enabled_snap = tne;
 		return;
@@ -1789,125 +1704,56 @@ static void rcu_prepare_for_idle(int cpu)
 	if (!tne)
 		return;
 
-	/* Adaptive-tick mode, where usermode execution is idle to RCU. */
-	if (!is_idle_task(current)) {
-		rdtp->dyntick_holdoff = jiffies - 1;
-		if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
-			trace_rcu_prep_idle("User dyntick with callbacks");
-			rdtp->idle_gp_timer_expires =
-				round_up(jiffies + RCU_IDLE_GP_DELAY,
-					 RCU_IDLE_GP_DELAY);
-		} else if (rcu_cpu_has_callbacks(cpu)) {
-			rdtp->idle_gp_timer_expires =
-				round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
-			trace_rcu_prep_idle("User dyntick with lazy callbacks");
-		} else {
-			return;
-		}
-		tp = &rdtp->idle_gp_timer;
-		mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
+	/* If this is a no-CBs CPU, no callbacks, just return. */
+	if (is_nocb_cpu(cpu))
 		return;
-	}
 
 	/*
-	 * If this is an idle re-entry, for example, due to use of
-	 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
-	 * loop, then don't take any state-machine actions, unless the
-	 * momentary exit from idle queued additional non-lazy callbacks.
-	 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
-	 * pending.
+	 * If a non-lazy callback arrived at a CPU having only lazy
+	 * callbacks, invoke RCU core for the side-effect of recalculating
+	 * idle duration on re-entry to idle.
 	 */
-	if (!rdtp->idle_first_pass &&
-	    (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
-		if (rcu_cpu_has_callbacks(cpu)) {
-			tp = &rdtp->idle_gp_timer;
-			mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
-		}
-		return;
-	}
-	rdtp->idle_first_pass = 0;
-	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
-
-	/*
-	 * If there are no callbacks on this CPU, enter dyntick-idle mode.
-	 * Also reset state to avoid prejudicing later attempts.
-	 */
-	if (!rcu_cpu_has_callbacks(cpu)) {
-		rdtp->dyntick_holdoff = jiffies - 1;
-		rdtp->dyntick_drain = 0;
-		trace_rcu_prep_idle("No callbacks");
-		return;
-	}
-
-	/*
-	 * If in holdoff mode, just return.  We will presumably have
-	 * refrained from disabling the scheduling-clock tick.
-	 */
-	if (rdtp->dyntick_holdoff == jiffies) {
-		trace_rcu_prep_idle("In holdoff");
-		return;
-	}
-
-	/* Check and update the ->dyntick_drain sequencing. */
-	if (rdtp->dyntick_drain <= 0) {
-		/* First time through, initialize the counter. */
-		rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
-	} else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
-		   !rcu_pending(cpu) &&
-		   !local_softirq_pending()) {
-		/* Can we go dyntick-idle despite still having callbacks? */
-		rdtp->dyntick_drain = 0;
-		rdtp->dyntick_holdoff = jiffies;
-		if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
-			trace_rcu_prep_idle("Dyntick with callbacks");
-			rdtp->idle_gp_timer_expires =
-				round_up(jiffies + RCU_IDLE_GP_DELAY,
-					 RCU_IDLE_GP_DELAY);
-		} else {
-			rdtp->idle_gp_timer_expires =
-				round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
-			trace_rcu_prep_idle("Dyntick with lazy callbacks");
-		}
-		tp = &rdtp->idle_gp_timer;
-		mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
-		rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
-		return; /* Nothing more to do immediately. */
-	} else if (--(rdtp->dyntick_drain) <= 0) {
-		/* We have hit the limit, so time to give up. */
-		rdtp->dyntick_holdoff = jiffies;
-		trace_rcu_prep_idle("Begin holdoff");
-		invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
-		return;
-	}
-
-	/*
-	 * Do one step of pushing the remaining RCU callbacks through
-	 * the RCU core state machine.
-	 */
-#ifdef CONFIG_TREE_PREEMPT_RCU
-	if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
-		rcu_preempt_qs(cpu);
-		force_quiescent_state(&rcu_preempt_state);
-	}
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-	if (per_cpu(rcu_sched_data, cpu).nxtlist) {
-		rcu_sched_qs(cpu);
-		force_quiescent_state(&rcu_sched_state);
-	}
-	if (per_cpu(rcu_bh_data, cpu).nxtlist) {
-		rcu_bh_qs(cpu);
-		force_quiescent_state(&rcu_bh_state);
-	}
-
-	/*
-	 * If RCU callbacks are still pending, RCU still needs this CPU.
-	 * So try forcing the callbacks through the grace period.
-	 */
-	if (rcu_cpu_has_callbacks(cpu)) {
-		trace_rcu_prep_idle("More callbacks");
+	if (rdtp->all_lazy &&
+	    rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
 		invoke_rcu_core();
-	} else {
-		trace_rcu_prep_idle("Callbacks drained");
+		return;
 	}
+
+	/*
+	 * If we have not yet accelerated this jiffy, accelerate all
+	 * callbacks on this CPU.
+	 */
+	if (rdtp->last_accelerate == jiffies)
+		return;
+	rdtp->last_accelerate = jiffies;
+	for_each_rcu_flavor(rsp) {
+		rdp = per_cpu_ptr(rsp->rda, cpu);
+		if (!*rdp->nxttail[RCU_DONE_TAIL])
+			continue;
+		rnp = rdp->mynode;
+		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+		rcu_accelerate_cbs(rsp, rnp, rdp);
+		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+	}
+}
 
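The tail of rcu_prepare_for_idle() above rate-limits callback acceleration to once per jiffy via ->last_accelerate. The following user-space model shows only that guard; jiffies, last_accelerate, and prepare_for_idle() are stand-ins for illustration, not kernel symbols.

#include <stdio.h>

static unsigned long jiffies;		/* stand-in for the kernel tick counter */
static unsigned long last_accelerate;	/* per-CPU in the kernel (rdtp->last_accelerate) */
static int accel_count;

/*
 * Repeated idle entries within the same tick skip the relatively
 * expensive callback-acceleration pass.
 */
static void prepare_for_idle(void)
{
	if (last_accelerate == jiffies)
		return;			/* already accelerated this tick */
	last_accelerate = jiffies;
	accel_count++;			/* kernel: rcu_accelerate_cbs() per flavor */
}

int main(void)
{
	for (int i = 0; i < 10; i++) {
		prepare_for_idle();	/* many idle entries... */
		if (i % 3 == 2)
			jiffies++;	/* ...but the tick advances rarely */
	}
	printf("accelerated %d times over %lu ticks\n", accel_count, jiffies);
	return 0;
}
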
+/*
+ * Clean up for exit from idle.  Attempt to advance callbacks based on
+ * any grace periods that elapsed while the CPU was idle, and if any
+ * callbacks are now ready to invoke, initiate invocation.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+	struct rcu_data *rdp;
+	struct rcu_state *rsp;
+
+	if (is_nocb_cpu(cpu))
+		return;
+	rcu_try_advance_all_cbs();
+	for_each_rcu_flavor(rsp) {
+		rdp = per_cpu_ptr(rsp->rda, cpu);
+		if (cpu_has_callbacks_ready_to_invoke(rdp))
+			invoke_rcu_core();
 	}
 }
 
@@ -2015,16 +1861,13 @@ early_initcall(rcu_register_oom_notifier);
 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 {
 	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-	struct timer_list *tltp = &rdtp->idle_gp_timer;
-	char c;
+	unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
 
-	c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
-	if (timer_pending(tltp))
-		sprintf(cp, "drain=%d %c timer=%lu",
-			rdtp->dyntick_drain, c, tltp->expires - jiffies);
-	else
-		sprintf(cp, "drain=%d %c timer not pending",
-			rdtp->dyntick_drain, c);
+	sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
+		rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
+		ulong2long(nlpd),
+		rdtp->all_lazy ? 'L' : '.',
+		rdtp->tick_nohz_enabled_snap ? '.' : 'D');
 }
 
 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2070,10 +1913,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 		ticks_value = rsp->gpnum - rdp->gpnum;
 	}
 	print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
-	printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
+	printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
 	       cpu, ticks_value, ticks_title,
 	       atomic_read(&rdtp->dynticks) & 0xfff,
 	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
+	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
 	       fast_no_hz);
 }
 
@@ -2087,6 +1931,7 @@ static void print_cpu_stall_info_end(void)
 static void zero_cpu_stall_ticks(struct rcu_data *rdp)
 {
 	rdp->ticks_this_gp = 0;
+	rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
 }
 
 /* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2165,6 +2010,47 @@ static int __init parse_rcu_nocb_poll(char *arg)
 }
 early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
 
+/*
+ * Do any no-CBs CPUs need another grace period?
+ *
+ * Interrupts must be disabled.  If the caller does not hold the root
+ * rnp_node structure's ->lock, the results are advisory only.
+ */
+static int rcu_nocb_needs_gp(struct rcu_state *rsp)
+{
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+	wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+}
+
+/*
+ * Set the root rcu_node structure's ->need_future_gp field
+ * based on the sum of those of all rcu_node structures.  This does
+ * double-count the root rcu_node structure's requests, but this
+ * is necessary to handle the possibility of a rcu_nocb_kthread()
+ * having awakened during the time that the rcu_node structures
+ * were being updated for the end of the previous grace period.
+ */
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+	rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+	init_waitqueue_head(&rnp->nocb_gp_wq[0]);
+	init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+}
+
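The no-CBs helpers added above key everything off the low bit of the grace-period number: requests for the next grace period are counted in need_future_gp[(completed + 1) & 0x1], and rcu_nocb_gp_cleanup() wakes the wait queue selected by the parity of the grace period that just completed. Here is a minimal user-space model of that two-slot scheme, with plain counters in place of wait queues; all names are illustrative.

#include <stdio.h>

struct node {
	unsigned long completed;	/* number of the last completed GP */
	int need_future_gp[2];		/* requests, indexed by GP-number parity */
};

/* Record a request for the grace period after the current one. */
static void request_future_gp(struct node *n)
{
	n->need_future_gp[(n->completed + 1) & 0x1]++;
}

/* Does anyone need the grace period that would follow ->completed? */
static int needs_gp(const struct node *n)
{
	return n->need_future_gp[(n->completed + 1) & 0x1];
}

/* End the current grace period and retire its request slot. */
static void complete_gp(struct node *n)
{
	n->completed++;
	n->need_future_gp[n->completed & 0x1] = 0;	/* kernel: wake waiters instead */
}

int main(void)
{
	struct node n = { 0, { 0, 0 } };

	request_future_gp(&n);
	request_future_gp(&n);
	printf("pending for GP %lu: %d\n", n.completed + 1, needs_gp(&n));
	complete_gp(&n);
	printf("pending after completion: %d\n", needs_gp(&n));
	return 0;
}
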
 /* Is the specified CPU a no-CPUs CPU? */
 static bool is_nocb_cpu(int cpu)
 {
@@ -2227,6 +2113,13 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 	if (!is_nocb_cpu(rdp->cpu))
 		return 0;
 	__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
+	if (__is_kfree_rcu_offset((unsigned long)rhp->func))
+		trace_rcu_kfree_callback(rdp->rsp->name, rhp,
+					 (unsigned long)rhp->func,
+					 rdp->qlen_lazy, rdp->qlen);
+	else
+		trace_rcu_callback(rdp->rsp->name, rhp,
+				   rdp->qlen_lazy, rdp->qlen);
 	return 1;
 }
 
@@ -2265,95 +2158,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
 }
 
 /*
- * There must be at least one non-no-CBs CPU in operation at any given
- * time, because no-CBs CPUs are not capable of initiating grace periods
- * independently.  This function therefore complains if the specified
- * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
- * avoid offlining the last such CPU.  (Recursion is a wonderful thing,
- * but you have to have a base case!)
+ * If necessary, kick off a new grace period, and either way wait
+ * for a subsequent grace period to complete.
  */
-static bool nocb_cpu_expendable(int cpu)
+static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 {
-	cpumask_var_t non_nocb_cpus;
-	int ret;
+	unsigned long c;
+	bool d;
+	unsigned long flags;
+	struct rcu_node *rnp = rdp->mynode;
+
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	c = rcu_start_future_gp(rnp, rdp);
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
-	 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
-	 * then offlining this CPU is harmless.  Let it happen.
+	 * Wait for the grace period.  Do so interruptibly to avoid messing
+	 * up the load average.
 	 */
-	if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
-		return 1;
-
-	/* If no memory, play it safe and keep the CPU around. */
-	if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
-		return 0;
-	cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
-	cpumask_clear_cpu(cpu, non_nocb_cpus);
-	ret = !cpumask_empty(non_nocb_cpus);
-	free_cpumask_var(non_nocb_cpus);
-	return ret;
-}
-
-/*
- * Helper structure for remote registry of RCU callbacks.
- * This is needed for when a no-CBs CPU needs to start a grace period.
- * If it just invokes call_rcu(), the resulting callback will be queued,
- * which can result in deadlock.
- */
-struct rcu_head_remote {
-	struct rcu_head *rhp;
-	call_rcu_func_t *crf;
-	void (*func)(struct rcu_head *rhp);
-};
-
-/*
- * Register a callback as specified by the rcu_head_remote struct.
- * This function is intended to be invoked via smp_call_function_single().
- */
-static void call_rcu_local(void *arg)
-{
-	struct rcu_head_remote *rhrp =
-		container_of(arg, struct rcu_head_remote, rhp);
-
-	rhrp->crf(rhrp->rhp, rhrp->func);
-}
-
-/*
- * Set up an rcu_head_remote structure and the invoke call_rcu_local()
- * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
- * smp_call_function_single().
- */
-static void invoke_crf_remote(struct rcu_head *rhp,
-			      void (*func)(struct rcu_head *rhp),
-			      call_rcu_func_t crf)
-{
-	struct rcu_head_remote rhr;
-
-	rhr.rhp = rhp;
-	rhr.crf = crf;
-	rhr.func = func;
-	smp_call_function_single(0, call_rcu_local, &rhr, 1);
-}
-
-/*
- * Helper functions to be passed to wait_rcu_gp(), each of which
- * invokes invoke_crf_remote() to register a callback appropriately.
- */
-static void __maybe_unused
-call_rcu_preempt_remote(struct rcu_head *rhp,
-			void (*func)(struct rcu_head *rhp))
-{
-	invoke_crf_remote(rhp, func, call_rcu);
-}
-static void call_rcu_bh_remote(struct rcu_head *rhp,
-			       void (*func)(struct rcu_head *rhp))
-{
-	invoke_crf_remote(rhp, func, call_rcu_bh);
-}
-static void call_rcu_sched_remote(struct rcu_head *rhp,
-				  void (*func)(struct rcu_head *rhp))
-{
-	invoke_crf_remote(rhp, func, call_rcu_sched);
+	trace_rcu_future_gp(rnp, rdp, c, "StartWait");
+	for (;;) {
+		wait_event_interruptible(
+			rnp->nocb_gp_wq[c & 0x1],
+			(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+		if (likely(d))
+			break;
+		flush_signals(current);
+		trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
+	}
+	trace_rcu_future_gp(rnp, rdp, c, "EndWait");
+	smp_mb(); /* Ensure that CB invocation happens after GP end. */
 }
 
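rcu_nocb_wait_gp() above picks a target grace-period number and then sleeps until ->completed catches up, tolerating spurious wakeups. The same wait-for-counter pattern can be modeled in user space with a mutex and condition variable (compile with -pthread); this is a sketch of the pattern only, not of the kernel's two-slot wait-queue machinery.

#include <pthread.h>
#include <stdio.h>

/* Shared grace-period state; a stand-in for the root rcu_node fields. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gp_ended = PTHREAD_COND_INITIALIZER;
static unsigned long completed;

/* Sleep until the completed counter reaches the target c. */
static void wait_for_gp(unsigned long c)
{
	pthread_mutex_lock(&lock);
	while (completed < c)			/* spurious wakeups re-check, like the for (;;) loop */
		pthread_cond_wait(&gp_ended, &lock);
	pthread_mutex_unlock(&lock);
}

static void *gp_thread(void *arg)
{
	(void)arg;
	for (int i = 0; i < 3; i++) {
		pthread_mutex_lock(&lock);
		completed++;			/* a grace period ends */
		pthread_cond_broadcast(&gp_ended);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, gp_thread, NULL);
	wait_for_gp(2);				/* kernel: c comes from rcu_start_future_gp() */
	printf("grace period 2 completed\n");
	pthread_join(t, NULL);
	return 0;
}
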
/*
|
/*
|
||||||
@ -2390,7 +2224,7 @@ static int rcu_nocb_kthread(void *arg)
|
|||||||
cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
|
cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
|
||||||
ACCESS_ONCE(rdp->nocb_p_count) += c;
|
ACCESS_ONCE(rdp->nocb_p_count) += c;
|
||||||
ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
|
ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
|
||||||
wait_rcu_gp(rdp->rsp->call_remote);
|
rcu_nocb_wait_gp(rdp);
|
||||||
|
|
||||||
/* Each pass through the following loop invokes a callback. */
|
/* Each pass through the following loop invokes a callback. */
|
||||||
trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
|
trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
|
||||||
@ -2436,33 +2270,42 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
|
|||||||
return;
|
return;
|
||||||
for_each_cpu(cpu, rcu_nocb_mask) {
|
for_each_cpu(cpu, rcu_nocb_mask) {
|
||||||
rdp = per_cpu_ptr(rsp->rda, cpu);
|
rdp = per_cpu_ptr(rsp->rda, cpu);
|
||||||
t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
|
t = kthread_run(rcu_nocb_kthread, rdp,
|
||||||
|
"rcuo%c/%d", rsp->abbr, cpu);
|
||||||
BUG_ON(IS_ERR(t));
|
BUG_ON(IS_ERR(t));
|
||||||
ACCESS_ONCE(rdp->nocb_kthread) = t;
|
ACCESS_ONCE(rdp->nocb_kthread) = t;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
|
/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
|
||||||
static void init_nocb_callback_list(struct rcu_data *rdp)
|
static bool init_nocb_callback_list(struct rcu_data *rdp)
|
||||||
{
|
{
|
||||||
if (rcu_nocb_mask == NULL ||
|
if (rcu_nocb_mask == NULL ||
|
||||||
!cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
|
!cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
|
||||||
return;
|
return false;
|
||||||
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
|
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
|
||||||
}
|
return true;
|
||||||
|
|
||||||
/* Initialize the ->call_remote fields in the rcu_state structures. */
|
|
||||||
static void __init rcu_init_nocb(void)
|
|
||||||
{
|
|
||||||
#ifdef CONFIG_PREEMPT_RCU
|
|
||||||
rcu_preempt_state.call_remote = call_rcu_preempt_remote;
|
|
||||||
#endif /* #ifdef CONFIG_PREEMPT_RCU */
|
|
||||||
rcu_bh_state.call_remote = call_rcu_bh_remote;
|
|
||||||
rcu_sched_state.call_remote = call_rcu_sched_remote;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
|
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
|
||||||
|
|
||||||
|
static int rcu_nocb_needs_gp(struct rcu_state *rsp)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
static void rcu_init_one_nocb(struct rcu_node *rnp)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
static bool is_nocb_cpu(int cpu)
|
static bool is_nocb_cpu(int cpu)
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
@ -2480,11 +2323,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool nocb_cpu_expendable(int cpu)
|
|
||||||
{
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
|
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@ -2493,12 +2331,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
static void init_nocb_callback_list(struct rcu_data *rdp)
|
static bool init_nocb_callback_list(struct rcu_data *rdp)
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __init rcu_init_nocb(void)
|
|
||||||
{
|
{
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
|
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
|
||||||
|
@ -46,8 +46,6 @@
|
|||||||
#define RCU_TREE_NONCORE
|
#define RCU_TREE_NONCORE
|
||||||
#include "rcutree.h"
|
#include "rcutree.h"
|
||||||
|
|
||||||
#define ulong2long(a) (*(long *)(&(a)))
|
|
||||||
|
|
||||||
static int r_open(struct inode *inode, struct file *file,
|
static int r_open(struct inode *inode, struct file *file,
|
||||||
const struct seq_operations *op)
|
const struct seq_operations *op)
|
||||||
{
|
{
|
||||||
|