From 15c7c972cd26d89a26788e609c53b5a465324a6c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 7 Oct 2019 18:53:18 -0700
Subject: [PATCH 01/69] rcu: Use *_ONCE() to protect lockless ->expmask
 accesses

The rcu_node structure's ->expmask field is accessed locklessly when
starting a new expedited grace period and when reporting an expedited
RCU CPU stall warning.  This commit therefore handles the former by
taking a snapshot of ->expmask while the lock is held and the latter
by applying READ_ONCE() to lockless reads and WRITE_ONCE() to the
corresponding updates.

Link: https://lore.kernel.org/lkml/CANpmjNNmSOagbTpffHr4=Yedckx9Rm2NuGqC9UqE+AOz5f1-ZQ@mail.gmail.com
Reported-by: syzbot+134336b86f728d6e55a0@syzkaller.appspotmail.com
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Marco Elver <elver@google.com>
---
 kernel/rcu/tree_exp.h | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d632cd019597..69c5aa64fcfd 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -134,7 +134,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
 	rcu_for_each_node_breadth_first(rnp) {
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		WARN_ON_ONCE(rnp->expmask);
-		rnp->expmask = rnp->expmaskinit;
+		WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
 }
@@ -211,7 +211,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 		rnp = rnp->parent;
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
 		WARN_ON_ONCE(!(rnp->expmask & mask));
-		rnp->expmask &= ~mask;
+		WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
 	}
 }
 
@@ -241,7 +241,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
-	rnp->expmask &= ~mask;
+	WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
 	__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
 }
 
@@ -372,12 +372,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
 	/* IPI the remaining CPUs for expedited quiescent state. */
-	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+	for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
 		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 
-		if (!(mask_ofl_ipi & mask))
-			continue;
 retry_ipi:
 		if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
 			mask_ofl_test |= mask;
@@ -491,7 +489,7 @@ static void synchronize_sched_expedited_wait(void)
 				struct rcu_data *rdp;
 
 				mask = leaf_node_cpu_bit(rnp, cpu);
-				if (!(rnp->expmask & mask))
+				if (!(READ_ONCE(rnp->expmask) & mask))
 					continue;
 				ndetected++;
 				rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -503,7 +501,8 @@ static void synchronize_sched_expedited_wait(void)
 		}
 		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
 			jiffies - jiffies_start, rcu_state.expedited_sequence,
-			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+			READ_ONCE(rnp_root->expmask),
+			".T"[!!rnp_root->exp_tasks]);
 		if (ndetected) {
 			pr_err("blocking rcu_node structures:");
 			rcu_for_each_node_breadth_first(rnp) {
@@ -513,7 +512,7 @@ static void synchronize_sched_expedited_wait(void)
 					continue;
 				pr_cont(" l=%u:%d-%d:%#lx/%c",
 					rnp->level, rnp->grplo, rnp->grphi,
-					rnp->expmask,
+					READ_ONCE(rnp->expmask),
 					".T"[!!rnp->exp_tasks]);
 			}
 			pr_cont("\n");
@@ -521,7 +520,7 @@ static void synchronize_sched_expedited_wait(void)
 		rcu_for_each_leaf_node(rnp) {
 			for_each_leaf_node_possible_cpu(rnp, cpu) {
 				mask = leaf_node_cpu_bit(rnp, cpu);
-				if (!(rnp->expmask & mask))
+				if (!(READ_ONCE(rnp->expmask) & mask))
 					continue;
 				dump_cpu_task(cpu);
 			}

From 9f08cf088676c12a5b53bd5a29cf04f00c787b5d Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Tue, 8 Oct 2019 13:01:40 +0800
Subject: [PATCH 02/69] rcu: Avoid modifying mask_ofl_ipi in
 sync_rcu_exp_select_node_cpus()

The "mask_ofl_ipi" is used to track which CPUs get IPIed, however
in the IPI sending loop, "mask_ofl_ipi" along with another variable
"mask_ofl_test" might also get modified to record which CPUs' quiesent
states must be reported by the sync_rcu_exp_select_node_cpus() at
the end of sync_rcu_exp_select_node_cpus().  This overlap of roles
can be confusing, so this patch cleans things a little by using
"mask_ofl_ipi" solely for determining which CPUs must be IPIed  and
"mask_ofl_test" for solely determining on behalf of  which CPUs
sync_rcu_exp_select_node_cpus() must report a quiscent state.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Acked-by: Marco Elver <elver@google.com>
---
 kernel/rcu/tree_exp.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 69c5aa64fcfd..6a6f328a5f52 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -387,10 +387,10 @@ retry_ipi:
 		}
 		ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
 		put_cpu();
-		if (!ret) {
-			mask_ofl_ipi &= ~mask;
+		/* The CPU will report the QS in response to the IPI. */
+		if (!ret)
 			continue;
-		}
+
 		/* Failed, raced with CPU hotplug operation. */
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if ((rnp->qsmaskinitnext & mask) &&
@@ -401,13 +401,12 @@ retry_ipi:
 			schedule_timeout_uninterruptible(1);
 			goto retry_ipi;
 		}
-		/* CPU really is offline, so we can ignore it. */
-		if (!(rnp->expmask & mask))
-			mask_ofl_ipi &= ~mask;
+		/* CPU really is offline, so we must report its QS. */
+		if (rnp->expmask & mask)
+			mask_ofl_test |= mask;
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
 	/* Report quiescent states for those that went offline. */
-	mask_ofl_test |= mask_ofl_ipi;
 	if (mask_ofl_test)
 		rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
 }

From 6cf539a87a61a4fbc43f625267dbcbcf283872ed Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 9 Oct 2019 17:57:43 +0200
Subject: [PATCH 03/69] rcu: Fix data-race due to atomic_t copy-by-value

This fixes a data-race where `atomic_t dynticks` is copied by value. The
copy is performed non-atomically, resulting in a data-race if `dynticks`
is updated concurrently.

This data-race was found with KCSAN:
==================================================================
BUG: KCSAN: data-race in dyntick_save_progress_counter / rcu_irq_enter

write to 0xffff989dbdbe98e0 of 4 bytes by task 10 on cpu 3:
 atomic_add_return include/asm-generic/atomic-instrumented.h:78 [inline]
 rcu_dynticks_snap kernel/rcu/tree.c:310 [inline]
 dyntick_save_progress_counter+0x43/0x1b0 kernel/rcu/tree.c:984
 force_qs_rnp+0x183/0x200 kernel/rcu/tree.c:2286
 rcu_gp_fqs kernel/rcu/tree.c:1601 [inline]
 rcu_gp_fqs_loop+0x71/0x880 kernel/rcu/tree.c:1653
 rcu_gp_kthread+0x22c/0x3b0 kernel/rcu/tree.c:1799
 kthread+0x1b5/0x200 kernel/kthread.c:255
 <snip>

read to 0xffff989dbdbe98e0 of 4 bytes by task 154 on cpu 7:
 rcu_nmi_enter_common kernel/rcu/tree.c:828 [inline]
 rcu_irq_enter+0xda/0x240 kernel/rcu/tree.c:870
 irq_enter+0x5/0x50 kernel/softirq.c:347
 <snip>

Reported by Kernel Concurrency Sanitizer on:
CPU: 7 PID: 154 Comm: kworker/7:1H Not tainted 5.3.0+ #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
Workqueue: kblockd blk_mq_run_work_fn
==================================================================

Signed-off-by: Marco Elver <elver@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: rcu@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/trace/events/rcu.h |  4 ++--
 kernel/rcu/tree.c          | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 66122602bd08..697e2c0624dc 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -449,7 +449,7 @@ TRACE_EVENT_RCU(rcu_fqs,
  */
 TRACE_EVENT_RCU(rcu_dyntick,
 
-	TP_PROTO(const char *polarity, long oldnesting, long newnesting, atomic_t dynticks),
+	TP_PROTO(const char *polarity, long oldnesting, long newnesting, int dynticks),
 
 	TP_ARGS(polarity, oldnesting, newnesting, dynticks),
 
@@ -464,7 +464,7 @@ TRACE_EVENT_RCU(rcu_dyntick,
 		__entry->polarity = polarity;
 		__entry->oldnesting = oldnesting;
 		__entry->newnesting = newnesting;
-		__entry->dynticks = atomic_read(&dynticks);
+		__entry->dynticks = dynticks;
 	),
 
 	TP_printk("%s %lx %lx %#3x", __entry->polarity,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1694a6b57ad8..6145e08a1407 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -577,7 +577,7 @@ static void rcu_eqs_enter(bool user)
 	}
 
 	lockdep_assert_irqs_disabled();
-	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
+	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	rdp = this_cpu_ptr(&rcu_data);
 	do_nocb_deferred_wakeup(rdp);
@@ -650,14 +650,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
 	 * leave it in non-RCU-idle state.
 	 */
 	if (rdp->dynticks_nmi_nesting != 1) {
-		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
+				  atomic_read(&rdp->dynticks));
 		WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
 			   rdp->dynticks_nmi_nesting - 2);
 		return;
 	}
 
 	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
+	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
 	if (irq)
@@ -744,7 +745,7 @@ static void rcu_eqs_exit(bool user)
 	rcu_dynticks_task_exit();
 	rcu_dynticks_eqs_exit();
 	rcu_cleanup_after_idle();
-	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	WRITE_ONCE(rdp->dynticks_nesting, 1);
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -833,7 +834,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
 	}
 	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
 			  rdp->dynticks_nmi_nesting,
-			  rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
+			  rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
 		   rdp->dynticks_nmi_nesting + incby);
 	barrier();

From aca2991a25da03ca96127b1d21e1f4aba41f81a6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 30 Oct 2019 06:51:57 -0700
Subject: [PATCH 04/69] rcu: Substitute lookup for bit-twiddling in
 sync_rcu_exp_select_node_cpus()

The code in sync_rcu_exp_select_node_cpus() calculates the current
CPU's mask within its rcu_node structure's bitmasks, but this has
already been computed in the ->grpmask field of that CPU's rcu_data
structure.  This commit therefore just uses this ->grpmask field.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6a6f328a5f52..3b59c3ee42e5 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -345,8 +345,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 	/* Each pass checks a CPU for identity, offline, and idle. */
 	mask_ofl_test = 0;
 	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+		unsigned long mask = rdp->grpmask;
 		int snap;
 
 		if (raw_smp_processor_id() == cpu ||
@@ -373,8 +373,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 
 	/* IPI the remaining CPUs for expedited quiescent state. */
 	for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
-		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+		unsigned long mask = rdp->grpmask;
 
 retry_ipi:
 		if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {

From fd6bc19d7676a060a171d1cf3dcbf6fd797eb05f Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay <neeraju@codeaurora.org>
Date: Tue, 19 Nov 2019 03:17:07 +0000
Subject: [PATCH 05/69] rcu: Fix missed wakeup of exp_wq waiters

Tasks waiting within exp_funnel_lock() for an expedited grace period to
elapse can be starved due to the following sequence of events:

1.	Tasks A and B both attempt to start an expedited grace
	period at about the same time.	This grace period will have
	completed when the lower four bits of the rcu_state structure's
	->expedited_sequence field are 0b'0100', for example, when the
	initial value of this counter is zero.	Task A wins, and thus
	does the actual work of starting the grace period, including
	acquiring the rcu_state structure's .exp_mutex and sets the
	counter to 0b'0001'.

2.	Because task B lost the race to start the grace period, it
	waits on ->expedited_sequence to reach 0b'0100' inside of
	exp_funnel_lock(). This task therefore blocks on the rcu_node
	structure's ->exp_wq[1] field, keeping in mind that the
	end-of-grace-period value of ->expedited_sequence (0b'0100')
	is shifted down two bits before indexing the ->exp_wq[] field.

3.	Task C attempts to start another expedited grace period,
	but blocks on ->exp_mutex, which is still held by Task A.

4.	The aforementioned expedited grace period completes, so that
	->expedited_sequence now has the value 0b'0100'.  A kworker task
	therefore acquires the rcu_state structure's ->exp_wake_mutex
	and starts awakening any tasks waiting for this grace period.

5.	One of the first tasks awakened happens to be Task A.  Task A
	therefore releases the rcu_state structure's ->exp_mutex,
	which allows Task C to start the next expedited grace period,
	which causes the lower four bits of the rcu_state structure's
	->expedited_sequence field to become 0b'0101'.

6.	Task C's expedited grace period completes, so that the lower four
	bits of the rcu_state structure's ->expedited_sequence field now
	become 0b'1000'.

7.	The kworker task from step 4 above continues its wakeups.
	Unfortunately, the wake_up_all() refetches the rcu_state
	structure's .expedited_sequence field:

	wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);

	This results in the wakeup being applied to the rcu_node
	structure's ->exp_wq[2] field, which is unfortunate given that
	Task B is instead waiting on ->exp_wq[1].

On a busy system, no harm is done (or at least no permanent harm is done).
Some later expedited grace period will redo the wakeup.  But on a quiet
system, such as many embedded systems, it might be a good long time before
there was another expedited grace period.  On such embedded systems,
this situation could therefore result in a system hang.

This issue manifested as DPM device timeout during suspend (which
usually qualifies as a quiet time) due to a SCSI device being stuck in
_synchronize_rcu_expedited(), with the following stack trace:

	schedule()
	synchronize_rcu_expedited()
	synchronize_rcu()
	scsi_device_quiesce()
	scsi_bus_suspend()
	dpm_run_callback()
	__device_suspend()

This commit therefore prevents such delays, timeouts, and hangs by
making rcu_exp_wait_wake() use its "s" argument consistently instead of
refetching from rcu_state.expedited_sequence.

Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period")
Signed-off-by: Neeraj Upadhyay <neeraju@codeaurora.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 3b59c3ee42e5..fa143e40cd93 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -557,7 +557,7 @@ static void rcu_exp_wait_wake(unsigned long s)
 			spin_unlock(&rnp->exp_lock);
 		}
 		smp_mb(); /* All above changes before wakeup. */
-		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
+		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
 	}
 	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
 	mutex_unlock(&rcu_state.exp_wake_mutex);

From 4bc6b745e5cbefed92c48071e28a5f41246d0470 Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay <neeraju@codeaurora.org>
Date: Tue, 19 Nov 2019 11:50:52 -0800
Subject: [PATCH 06/69] rcu: Allow only one expedited GP to run concurrently
 with wakeups

The current expedited RCU grace-period code expects that a task
requesting an expedited grace period cannot awaken until that grace
period has reached the wakeup phase.  However, it is possible for a long
preemption to result in the waiting task never sleeping.  For example,
consider the following sequence of events:

1.	Task A starts an expedited grace period by invoking
	synchronize_rcu_expedited().  It proceeds normally up to the
	wait_event() near the end of that function, and is then preempted
	(or interrupted or whatever).

2.	The expedited grace period completes, and a kworker task starts
	the awaken phase, having incremented the counter and acquired
	the rcu_state structure's .exp_wake_mutex.  This kworker task
	is then preempted or interrupted or whatever.

3.	Task A resumes and enters wait_event(), which notes that the
	expedited grace period has completed, and thus doesn't sleep.

4.	Task B starts an expedited grace period exactly as did Task A,
	complete with the preemption (or whatever delay) just before
	the call to wait_event().

5.	The expedited grace period completes, and another kworker
	task starts the awaken phase, having incremented the counter.
	However, it blocks when attempting to acquire the rcu_state
	structure's .exp_wake_mutex because step 2's kworker task has
	not yet released it.

6.	Steps 4 and 5 repeat, resulting in overflow of the rcu_node
	structure's ->exp_wq[] array.

In theory, this is harmless.  Tasks waiting on the various ->exp_wq[]
array will just be spuriously awakened, but they will just sleep again
on noting that the rcu_state structure's ->expedited_sequence value has
not advanced far enough.

In practice, this wastes CPU time and is an accident waiting to happen.
This commit therefore moves the rcu_exp_gp_seq_end() call that officially
ends the expedited grace period (along with associate tracing) until
after the ->exp_wake_mutex has been acquired.  This prevents Task A from
awakening prematurely, thus preventing more than one expedited grace
period from being in flight during a previous expedited grace period's
wakeup phase.

Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period")
Signed-off-by: Neeraj Upadhyay <neeraju@codeaurora.org>
[ paulmck: Added updated comment. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index fa143e40cd93..7a1f09376e62 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -539,15 +539,14 @@ static void rcu_exp_wait_wake(unsigned long s)
 	struct rcu_node *rnp;
 
 	synchronize_sched_expedited_wait();
+
+	// Switch over to wakeup mode, allowing the next GP to proceed.
+	// End the previous grace period only after acquiring the mutex
+	// to ensure that only one GP runs concurrently with wakeups.
+	mutex_lock(&rcu_state.exp_wake_mutex);
 	rcu_exp_gp_seq_end();
 	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
 
-	/*
-	 * Switch over to wakeup mode, allowing the next GP, but -only- the
-	 * next GP, to proceed.
-	 */
-	mutex_lock(&rcu_state.exp_wake_mutex);
-
 	rcu_for_each_node_breadth_first(rnp) {
 		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
 			spin_lock(&rnp->exp_lock);

From 6c7d7dbf5b7f965eda0d39fbbb8fee005b08f340 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 27 Nov 2019 13:59:37 -0800
Subject: [PATCH 07/69] rcu: Rename sync_rcu_preempt_exp_done() to
 sync_rcu_exp_done()

Now that the RCU flavors have been consolidated, there is one common
function for checking to see if an expedited RCU grace period has
completed, namely sync_rcu_preempt_exp_done().  Because this function is
no longer specific to RCU-preempt, this commit removes the "_preempt" from
its name.  This commit also changes sync_rcu_preempt_exp_done_unlocked()
to sync_rcu_exp_done_unlocked() for the same reason.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h    | 19 +++++++++----------
 kernel/rcu/tree_plugin.h |  4 ++--
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 7a1f09376e62..3923c0743c3e 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -148,7 +148,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
  *
  * Caller must hold the specificed rcu_node structure's ->lock
  */
-static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+static bool sync_rcu_exp_done(struct rcu_node *rnp)
 {
 	raw_lockdep_assert_held_rcu_node(rnp);
 
@@ -157,17 +157,16 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 }
 
 /*
- * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
- * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
- * itself
+ * Like sync_rcu_exp_done(), but this function assumes the caller doesn't
+ * hold the rcu_node's ->lock, and will acquire and release the lock itself
  */
-static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
 {
 	unsigned long flags;
 	bool ret;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	ret = sync_rcu_preempt_exp_done(rnp);
+	ret = sync_rcu_exp_done(rnp);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
 	return ret;
@@ -191,7 +190,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 	unsigned long mask;
 
 	for (;;) {
-		if (!sync_rcu_preempt_exp_done(rnp)) {
+		if (!sync_rcu_exp_done(rnp)) {
 			if (!rnp->expmask)
 				rcu_initiate_boost(rnp, flags);
 			else
@@ -471,9 +470,9 @@ static void synchronize_sched_expedited_wait(void)
 	for (;;) {
 		ret = swait_event_timeout_exclusive(
 				rcu_state.expedited_wq,
-				sync_rcu_preempt_exp_done_unlocked(rnp_root),
+				sync_rcu_exp_done_unlocked(rnp_root),
 				jiffies_stall);
-		if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
+		if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root))
 			return;
 		WARN_ON(ret < 0);  /* workqueues should not be signaled. */
 		if (rcu_cpu_stall_suppress)
@@ -507,7 +506,7 @@ static void synchronize_sched_expedited_wait(void)
 			rcu_for_each_node_breadth_first(rnp) {
 				if (rnp == rnp_root)
 					continue; /* printed unconditionally */
-				if (sync_rcu_preempt_exp_done_unlocked(rnp))
+				if (sync_rcu_exp_done_unlocked(rnp))
 					continue;
 				pr_cont(" l=%u:%d-%d:%#lx/%c",
 					rnp->level, rnp->grplo, rnp->grphi,
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fa08d55f7040..6dbea4bcf065 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -485,7 +485,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
 		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
 			     (!empty_norm || rnp->qsmask));
-		empty_exp = sync_rcu_preempt_exp_done(rnp);
+		empty_exp = sync_rcu_exp_done(rnp);
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		np = rcu_next_node_entry(t, rnp);
 		list_del_init(&t->rcu_node_entry);
@@ -509,7 +509,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
 		 * so we must take a snapshot of the expedited state.
 		 */
-		empty_exp_now = sync_rcu_preempt_exp_done(rnp);
+		empty_exp_now = sync_rcu_exp_done(rnp);
 		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
 			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
 							 rnp->gp_seq,

From de8cd0a533bfb57ff4ec6c85e3bdca013a5adcb7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 27 Nov 2019 14:20:41 -0800
Subject: [PATCH 08/69] rcu: Update tree_exp.h function-header comments

The function-header comments in kernel/rcu/tree_exp.h have gotten a bit
out of date, so this commit updates a number of them.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 3923c0743c3e..1eafbcd56679 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void)
 }
 
 /*
- * Return then value that expedited-grace-period counter will have
+ * Return the value that the expedited-grace-period counter will have
  * at the end of the current grace period.
  */
 static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
@@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void)
 }
 
 /*
- * Take a snapshot of the expedited-grace-period counter.
+ * Take a snapshot of the expedited-grace-period counter, which is the
+ * earliest value that will indicate that a full grace period has
+ * elapsed since the current time.
  */
 static unsigned long rcu_exp_gp_seq_snap(void)
 {
@@ -143,22 +145,18 @@ static void __maybe_unused sync_exp_reset_tree(void)
  * Return non-zero if there is no RCU expedited grace period in progress
  * for the specified rcu_node structure, in other words, if all CPUs and
  * tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period.  Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold the specificed rcu_node structure's ->lock
+ * for the current expedited grace period.
  */
 static bool sync_rcu_exp_done(struct rcu_node *rnp)
 {
 	raw_lockdep_assert_held_rcu_node(rnp);
-
 	return rnp->exp_tasks == NULL &&
 	       READ_ONCE(rnp->expmask) == 0;
 }
 
 /*
- * Like sync_rcu_exp_done(), but this function assumes the caller doesn't
- * hold the rcu_node's ->lock, and will acquire and release the lock itself
+ * Like sync_rcu_exp_done(), but where the caller does not hold the
+ * rcu_node's ->lock.
  */
 static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
 {
@@ -180,8 +178,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
  * which the task was queued or to one of that rcu_node structure's ancestors,
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
- *
- * Caller must hold the specified rcu_node structure's ->lock.
  */
 static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 				 bool wake, unsigned long flags)
@@ -189,6 +185,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 {
 	unsigned long mask;
 
+	raw_lockdep_assert_held_rcu_node(rnp);
 	for (;;) {
 		if (!sync_rcu_exp_done(rnp)) {
 			if (!rnp->expmask)
@@ -452,6 +449,10 @@ static void sync_rcu_exp_select_cpus(void)
 			flush_work(&rnp->rew.rew_work);
 }
 
+/*
+ * Wait for the expedited grace period to elapse, issuing any needed
+ * RCU CPU stall warnings along the way.
+ */
 static void synchronize_sched_expedited_wait(void)
 {
 	int cpu;
@@ -781,7 +782,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
  * implementations, it is still unfriendly to real-time workloads, so is
  * thus not recommended for any sort of common-case code.  In fact, if
  * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
+ * your code to batch your updates, and then use a single synchronize_rcu()
  * instead.
  *
  * This has the same semantics as (but is more brutal than) synchronize_rcu().

From 28f0361fdfab267a392cd6a6401446c9ea64de95 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 27 Nov 2019 14:24:58 -0800
Subject: [PATCH 09/69] rcu: Replace synchronize_sched_expedited_wait()
 "_sched" with "_rcu"

After RCU flavor consolidation, synchronize_sched_expedited_wait() does
both RCU-preempt and RCU-sched, whichever happens to have been built into
the running kernel.  This commit therefore changes this function's name
to synchronize_rcu_expedited_wait() to reflect its new generic nature.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 1eafbcd56679..081a17942e57 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -453,7 +453,7 @@ static void sync_rcu_exp_select_cpus(void)
  * Wait for the expedited grace period to elapse, issuing any needed
  * RCU CPU stall warnings along the way.
  */
-static void synchronize_sched_expedited_wait(void)
+static void synchronize_rcu_expedited_wait(void)
 {
 	int cpu;
 	unsigned long jiffies_stall;
@@ -538,7 +538,7 @@ static void rcu_exp_wait_wake(unsigned long s)
 {
 	struct rcu_node *rnp;
 
-	synchronize_sched_expedited_wait();
+	synchronize_rcu_expedited_wait();
 
 	// Switch over to wakeup mode, allowing the next GP to proceed.
 	// End the previous grace period only after acquiring the mutex

From df1e849ae4559544ff00ff5052eefe2479750539 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 27 Nov 2019 16:36:45 -0800
Subject: [PATCH 10/69] rcu: Enable tick for nohz_full CPUs slow to provide
 expedited QS

An expedited grace period can be stalled by a nohz_full CPU looping
in kernel context.  This possibility is currently handled by some
carefully crafted checks in rcu_read_unlock_special() that enlist help
from ksoftirqd when permitted by the scheduler.  However, it is exactly
these checks that require the scheduler avoid holding any of its rq or
pi locks across rcu_read_unlock() without also having held them across
the entire RCU read-side critical section.

It would therefore be very nice if expedited grace periods could
handle nohz_full CPUs looping in kernel context without such checks.
This commit therefore adds code to the expedited grace period's wait
and cleanup code that forces the scheduler-clock interrupt on for CPUs
that fail to quickly supply a quiescent state.  "Quickly" is currently
a hard-coded single-jiffy delay.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/tick.h  |  5 ++++-
 kernel/rcu/tree.h     |  1 +
 kernel/rcu/tree_exp.h | 52 +++++++++++++++++++++++++++++++++++++------
 3 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7896f792d3b0..7340613c7eff 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -109,8 +109,10 @@ enum tick_dep_bits {
 	TICK_DEP_BIT_PERF_EVENTS	= 1,
 	TICK_DEP_BIT_SCHED		= 2,
 	TICK_DEP_BIT_CLOCK_UNSTABLE	= 3,
-	TICK_DEP_BIT_RCU		= 4
+	TICK_DEP_BIT_RCU		= 4,
+	TICK_DEP_BIT_RCU_EXP		= 5
 };
+#define TICK_DEP_BIT_MAX TICK_DEP_BIT_RCU_EXP
 
 #define TICK_DEP_MASK_NONE		0
 #define TICK_DEP_MASK_POSIX_TIMER	(1 << TICK_DEP_BIT_POSIX_TIMER)
@@ -118,6 +120,7 @@ enum tick_dep_bits {
 #define TICK_DEP_MASK_SCHED		(1 << TICK_DEP_BIT_SCHED)
 #define TICK_DEP_MASK_CLOCK_UNSTABLE	(1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
 #define TICK_DEP_MASK_RCU		(1 << TICK_DEP_BIT_RCU)
+#define TICK_DEP_MASK_RCU_EXP		(1 << TICK_DEP_BIT_RCU_EXP)
 
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 055c31781d3a..f9253ed406ba 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -182,6 +182,7 @@ struct rcu_data {
 	bool rcu_need_heavy_qs;		/* GP old, so heavy quiescent state! */
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
 	bool rcu_forced_tick;		/* Forced tick to provide QS. */
+	bool rcu_forced_tick_exp;	/*   ... provide QS to expedited GP. */
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	bool all_lazy;			/* All CPU's CBs lazy at idle start? */
 	unsigned long last_accelerate;	/* Last jiffy CBs were accelerated. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 081a17942e57..30b2a02aef39 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -230,7 +230,9 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
 static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
 				    unsigned long mask, bool wake)
 {
+	int cpu;
 	unsigned long flags;
+	struct rcu_data *rdp;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	if (!(rnp->expmask & mask)) {
@@ -238,6 +240,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
 		return;
 	}
 	WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
+	for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
+		rdp = per_cpu_ptr(&rcu_data, cpu);
+		if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp)
+			continue;
+		rdp->rcu_forced_tick_exp = false;
+		tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+	}
 	__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
 }
 
@@ -449,6 +458,26 @@ static void sync_rcu_exp_select_cpus(void)
 			flush_work(&rnp->rew.rew_work);
 }
 
+/*
+ * Wait for the expedited grace period to elapse, within time limit.
+ * If the time limit is exceeded without the grace period elapsing,
+ * return false, otherwise return true.
+ */
+static bool synchronize_rcu_expedited_wait_once(long tlimit)
+{
+	int t;
+	struct rcu_node *rnp_root = rcu_get_root();
+
+	t = swait_event_timeout_exclusive(rcu_state.expedited_wq,
+					  sync_rcu_exp_done_unlocked(rnp_root),
+					  tlimit);
+	// Workqueues should not be signaled.
+	if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+		return true;
+	WARN_ON(t < 0);  /* workqueues should not be signaled. */
+	return false;
+}
+
 /*
  * Wait for the expedited grace period to elapse, issuing any needed
  * RCU CPU stall warnings along the way.
@@ -460,22 +489,31 @@ static void synchronize_rcu_expedited_wait(void)
 	unsigned long jiffies_start;
 	unsigned long mask;
 	int ndetected;
+	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 	struct rcu_node *rnp_root = rcu_get_root();
-	int ret;
 
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
 	jiffies_stall = rcu_jiffies_till_stall_check();
 	jiffies_start = jiffies;
+	if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+		if (synchronize_rcu_expedited_wait_once(1))
+			return;
+		rcu_for_each_leaf_node(rnp) {
+			for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+				rdp = per_cpu_ptr(&rcu_data, cpu);
+				if (rdp->rcu_forced_tick_exp)
+					continue;
+				rdp->rcu_forced_tick_exp = true;
+				tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+			}
+		}
+		WARN_ON_ONCE(1);
+	}
 
 	for (;;) {
-		ret = swait_event_timeout_exclusive(
-				rcu_state.expedited_wq,
-				sync_rcu_exp_done_unlocked(rnp_root),
-				jiffies_stall);
-		if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+		if (synchronize_rcu_expedited_wait_once(jiffies_stall))
 			return;
-		WARN_ON(ret < 0);  /* workqueues should not be signaled. */
 		if (rcu_cpu_stall_suppress)
 			continue;
 		panic_on_rcu_stall();

From f452ee096d95482892b101bde4fd037fa025d3cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= <j.neuschaefer@gmx.net>
Date: Fri, 4 Oct 2019 23:54:02 +0200
Subject: [PATCH 11/69] rculist: Describe variadic macro argument in a
 Sphinx-compatible way
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this patch, Sphinx shows "variable arguments" as the description
of the cond argument, rather than the intended description, and prints
the following warnings:

./include/linux/rculist.h:374: warning: Excess function parameter 'cond' description in 'list_for_each_entry_rcu'
./include/linux/rculist.h:651: warning: Excess function parameter 'cond' description in 'hlist_for_each_entry_rcu'

Signed-off-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Acked-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rculist.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4158b7212936..61c6728a71f7 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -361,7 +361,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
  * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the list_head within the struct.
- * @cond:	optional lockdep expression if called from non-RCU protection.
+ * @cond...:	optional lockdep expression if called from non-RCU protection.
  *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as list_add_rcu()
@@ -636,7 +636,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
  * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the hlist_node within the struct.
- * @cond:	optional lockdep expression if called from non-RCU protection.
+ * @cond...:	optional lockdep expression if called from non-RCU protection.
  *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as hlist_add_head_rcu()

From c54a2744497db4b6887b9c905ef7aa0b3620c956 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2019 11:37:37 -0800
Subject: [PATCH 12/69] list: Add hlist_unhashed_lockless()

We would like to use hlist_unhashed() from timer_pending(),
which runs without protection of a lock.

Note that other callers might also want to use this variant.

Instead of forcing a READ_ONCE() for all hlist_unhashed()
callers, add a new helper with an explicit _lockless suffix
in the name to better document what is going on.

Also add various WRITE_ONCE() in __hlist_del(), hlist_add_head()
and hlist_add_before()/hlist_add_behind() to pair with
the READ_ONCE().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
[ paulmck: Also add WRITE_ONCE() to rculist.h. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/list.h    | 32 +++++++++++++++++++++-----------
 include/linux/rculist.h | 24 ++++++++++++------------
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/include/linux/list.h b/include/linux/list.h
index 85c92555e31f..61f5aaf96192 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -749,6 +749,16 @@ static inline int hlist_unhashed(const struct hlist_node *h)
 	return !h->pprev;
 }
 
+/* This variant of hlist_unhashed() must be used in lockless contexts
+ * to avoid potential load-tearing.
+ * The READ_ONCE() is paired with the various WRITE_ONCE() in hlist
+ * helpers that are defined below.
+ */
+static inline int hlist_unhashed_lockless(const struct hlist_node *h)
+{
+	return !READ_ONCE(h->pprev);
+}
+
 static inline int hlist_empty(const struct hlist_head *h)
 {
 	return !READ_ONCE(h->first);
@@ -761,7 +771,7 @@ static inline void __hlist_del(struct hlist_node *n)
 
 	WRITE_ONCE(*pprev, next);
 	if (next)
-		next->pprev = pprev;
+		WRITE_ONCE(next->pprev, pprev);
 }
 
 static inline void hlist_del(struct hlist_node *n)
@@ -782,32 +792,32 @@ static inline void hlist_del_init(struct hlist_node *n)
 static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
 {
 	struct hlist_node *first = h->first;
-	n->next = first;
+	WRITE_ONCE(n->next, first);
 	if (first)
-		first->pprev = &n->next;
+		WRITE_ONCE(first->pprev, &n->next);
 	WRITE_ONCE(h->first, n);
-	n->pprev = &h->first;
+	WRITE_ONCE(n->pprev, &h->first);
 }
 
 /* next must be != NULL */
 static inline void hlist_add_before(struct hlist_node *n,
 					struct hlist_node *next)
 {
-	n->pprev = next->pprev;
-	n->next = next;
-	next->pprev = &n->next;
+	WRITE_ONCE(n->pprev, next->pprev);
+	WRITE_ONCE(n->next, next);
+	WRITE_ONCE(next->pprev, &n->next);
 	WRITE_ONCE(*(n->pprev), n);
 }
 
 static inline void hlist_add_behind(struct hlist_node *n,
 				    struct hlist_node *prev)
 {
-	n->next = prev->next;
-	prev->next = n;
-	n->pprev = &prev->next;
+	WRITE_ONCE(n->next, prev->next);
+	WRITE_ONCE(prev->next, n);
+	WRITE_ONCE(n->pprev, &prev->next);
 
 	if (n->next)
-		n->next->pprev  = &n->next;
+		WRITE_ONCE(n->next->pprev, &n->next);
 }
 
 /* after that we'll appear to be on some hlist and hlist_del will work */
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 61c6728a71f7..4b7ae1bf50b3 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -173,7 +173,7 @@ static inline void hlist_del_init_rcu(struct hlist_node *n)
 {
 	if (!hlist_unhashed(n)) {
 		__hlist_del(n);
-		n->pprev = NULL;
+		WRITE_ONCE(n->pprev, NULL);
 	}
 }
 
@@ -473,7 +473,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
 static inline void hlist_del_rcu(struct hlist_node *n)
 {
 	__hlist_del(n);
-	n->pprev = LIST_POISON2;
+	WRITE_ONCE(n->pprev, LIST_POISON2);
 }
 
 /**
@@ -489,11 +489,11 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
 	struct hlist_node *next = old->next;
 
 	new->next = next;
-	new->pprev = old->pprev;
+	WRITE_ONCE(new->pprev, old->pprev);
 	rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
 	if (next)
-		new->next->pprev = &new->next;
-	old->pprev = LIST_POISON2;
+		WRITE_ONCE(new->next->pprev, &new->next);
+	WRITE_ONCE(old->pprev, LIST_POISON2);
 }
 
 /*
@@ -528,10 +528,10 @@ static inline void hlist_add_head_rcu(struct hlist_node *n,
 	struct hlist_node *first = h->first;
 
 	n->next = first;
-	n->pprev = &h->first;
+	WRITE_ONCE(n->pprev, &h->first);
 	rcu_assign_pointer(hlist_first_rcu(h), n);
 	if (first)
-		first->pprev = &n->next;
+		WRITE_ONCE(first->pprev, &n->next);
 }
 
 /**
@@ -564,7 +564,7 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n,
 
 	if (last) {
 		n->next = last->next;
-		n->pprev = &last->next;
+		WRITE_ONCE(n->pprev, &last->next);
 		rcu_assign_pointer(hlist_next_rcu(last), n);
 	} else {
 		hlist_add_head_rcu(n, h);
@@ -592,10 +592,10 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n,
 static inline void hlist_add_before_rcu(struct hlist_node *n,
 					struct hlist_node *next)
 {
-	n->pprev = next->pprev;
+	WRITE_ONCE(n->pprev, next->pprev);
 	n->next = next;
 	rcu_assign_pointer(hlist_pprev_rcu(n), n);
-	next->pprev = &n->next;
+	WRITE_ONCE(next->pprev, &n->next);
 }
 
 /**
@@ -620,10 +620,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
 					struct hlist_node *prev)
 {
 	n->next = prev->next;
-	n->pprev = &prev->next;
+	WRITE_ONCE(n->pprev, &prev->next);
 	rcu_assign_pointer(hlist_next_rcu(prev), n);
 	if (n->next)
-		n->next->pprev = &n->next;
+		WRITE_ONCE(n->next->pprev, &n->next);
 }
 
 #define __hlist_for_each_rcu(pos, head)				\

From 610dea36d3083a977e4f156206cbe1eaa2a532f0 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <stefan@pimaker.at>
Date: Fri, 4 Oct 2019 19:49:10 +0000
Subject: [PATCH 13/69] rcu/nocb: Fix dump_tree hierarchy print always active

Commit 18cd8c93e69e ("rcu/nocb: Print gp/cb kthread hierarchy if
dump_tree") added print statements to rcu_organize_nocb_kthreads for
debugging, but incorrectly guarded them, causing the function to always
spew out its message.

This patch fixes it by guarding both pr_alert statements with dump_tree,
while also changing the second pr_alert to a pr_cont, to print the
hierarchy in a single line (assuming that's how it was supposed to
work).

Fixes: 18cd8c93e69e ("rcu/nocb: Print gp/cb kthread hierarchy if dump_tree")
Signed-off-by: Stefan Reiter <stefan@pimaker.at>
[ paulmck: Make single-nocbs-CPU GP kthreads look less erroneous. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fa08d55f7040..758bfe1de536 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2321,6 +2321,8 @@ static void __init rcu_organize_nocb_kthreads(void)
 {
 	int cpu;
 	bool firsttime = true;
+	bool gotnocbs = false;
+	bool gotnocbscbs = true;
 	int ls = rcu_nocb_gp_stride;
 	int nl = 0;  /* Next GP kthread. */
 	struct rcu_data *rdp;
@@ -2343,21 +2345,31 @@ static void __init rcu_organize_nocb_kthreads(void)
 		rdp = per_cpu_ptr(&rcu_data, cpu);
 		if (rdp->cpu >= nl) {
 			/* New GP kthread, set up for CBs & next GP. */
+			gotnocbs = true;
 			nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
 			rdp->nocb_gp_rdp = rdp;
 			rdp_gp = rdp;
-			if (!firsttime && dump_tree)
-				pr_cont("\n");
-			firsttime = false;
-			pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu);
+			if (dump_tree) {
+				if (!firsttime)
+					pr_cont("%s\n", gotnocbscbs
+							? "" : " (self only)");
+				gotnocbscbs = false;
+				firsttime = false;
+				pr_alert("%s: No-CB GP kthread CPU %d:",
+					 __func__, cpu);
+			}
 		} else {
 			/* Another CB kthread, link to previous GP kthread. */
+			gotnocbscbs = true;
 			rdp->nocb_gp_rdp = rdp_gp;
 			rdp_prev->nocb_next_cb_rdp = rdp;
-			pr_alert(" %d", cpu);
+			if (dump_tree)
+				pr_cont(" %d", cpu);
 		}
 		rdp_prev = rdp;
 	}
+	if (gotnocbs && dump_tree)
+		pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
 }
 
 /*

From 6935c3983b246d5fbfebd3b891c825e65c118f2d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Oct 2019 14:21:54 -0700
Subject: [PATCH 14/69] rcu: Avoid data-race in rcu_gp_fqs_check_wake()

The rcu_gp_fqs_check_wake() function uses rcu_preempt_blocked_readers_cgp()
to read ->gp_tasks while other cpus might overwrite this field.

We need READ_ONCE()/WRITE_ONCE() pairs to avoid compiler
tricks and KCSAN splats like the following :

BUG: KCSAN: data-race in rcu_gp_fqs_check_wake / rcu_preempt_deferred_qs_irqrestore

write to 0xffffffff85a7f190 of 8 bytes by task 7317 on cpu 0:
 rcu_preempt_deferred_qs_irqrestore+0x43d/0x580 kernel/rcu/tree_plugin.h:507
 rcu_read_unlock_special+0xec/0x370 kernel/rcu/tree_plugin.h:659
 __rcu_read_unlock+0xcf/0xe0 kernel/rcu/tree_plugin.h:394
 rcu_read_unlock include/linux/rcupdate.h:645 [inline]
 __ip_queue_xmit+0x3b0/0xa40 net/ipv4/ip_output.c:533
 ip_queue_xmit+0x45/0x60 include/net/ip.h:236
 __tcp_transmit_skb+0xdeb/0x1cd0 net/ipv4/tcp_output.c:1158
 __tcp_send_ack+0x246/0x300 net/ipv4/tcp_output.c:3685
 tcp_send_ack+0x34/0x40 net/ipv4/tcp_output.c:3691
 tcp_cleanup_rbuf+0x130/0x360 net/ipv4/tcp.c:1575
 tcp_recvmsg+0x633/0x1a30 net/ipv4/tcp.c:2179
 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838
 sock_recvmsg_nosec net/socket.c:871 [inline]
 sock_recvmsg net/socket.c:889 [inline]
 sock_recvmsg+0x92/0xb0 net/socket.c:885
 sock_read_iter+0x15f/0x1e0 net/socket.c:967
 call_read_iter include/linux/fs.h:1864 [inline]
 new_sync_read+0x389/0x4f0 fs/read_write.c:414

read to 0xffffffff85a7f190 of 8 bytes by task 10 on cpu 1:
 rcu_gp_fqs_check_wake kernel/rcu/tree.c:1556 [inline]
 rcu_gp_fqs_check_wake+0x93/0xd0 kernel/rcu/tree.c:1546
 rcu_gp_fqs_loop+0x36c/0x580 kernel/rcu/tree.c:1611
 rcu_gp_kthread+0x143/0x220 kernel/rcu/tree.c:1768
 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 10 Comm: rcu_preempt Not tainted 5.3.0+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
[ paulmck:  Added another READ_ONCE() for RCU CPU stall warnings. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 758bfe1de536..fe5f44811761 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -220,7 +220,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * blocked tasks.
 	 */
 	if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
-		rnp->gp_tasks = &t->rcu_node_entry;
+		WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
 		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
 	}
 	if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
@@ -340,7 +340,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  */
 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 {
-	return rnp->gp_tasks != NULL;
+	return READ_ONCE(rnp->gp_tasks) != NULL;
 }
 
 /* Bias and limit values for ->rcu_read_lock_nesting. */
@@ -493,7 +493,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
 						rnp->gp_seq, t->pid);
 		if (&t->rcu_node_entry == rnp->gp_tasks)
-			rnp->gp_tasks = np;
+			WRITE_ONCE(rnp->gp_tasks, np);
 		if (&t->rcu_node_entry == rnp->exp_tasks)
 			rnp->exp_tasks = np;
 		if (IS_ENABLED(CONFIG_RCU_BOOST)) {
@@ -663,7 +663,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 		dump_blkd_tasks(rnp, 10);
 	if (rcu_preempt_has_tasks(rnp) &&
 	    (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
-		rnp->gp_tasks = rnp->blkd_tasks.next;
+		WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
 		t = container_of(rnp->gp_tasks, struct task_struct,
 				 rcu_node_entry);
 		trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
@@ -757,7 +757,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
 		pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
 			__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
 	pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
-		__func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks);
+		__func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks,
+		rnp->exp_tasks);
 	pr_info("%s: ->blkd_tasks", __func__);
 	i = 0;
 	list_for_each(lhp, &rnp->blkd_tasks) {

From 03bd2983d7a9f898fd89f8f7215c3e56732d8ecd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 10 Oct 2019 09:05:27 -0700
Subject: [PATCH 15/69] rcu: Use lockdep rather than comment to enforce lock
 held

The rcu_preempt_check_blocked_tasks() function has a comment
that states that the rcu_node structure's ->lock must be held,
which might be informative, but which carries little weight if
not read.  This commit therefore removes this comment in favor of
raw_lockdep_assert_held_rcu_node(), which will complain quite
visibly if the required lock is not held.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fe5f44811761..ed54d36465e2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -648,8 +648,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
  * Check that the list of blocked tasks for the newly completed grace
  * period is in fact empty.  It is a serious bug to complete a grace
  * period that still has RCU readers blocked!  This function must be
- * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock
- * must be held by the caller.
+ * invoked -before- updating this rnp's ->gp_seq.
  *
  * Also, if there are blocked tasks on the list, they automatically
  * block the newly created grace period, so set up ->gp_tasks accordingly.
@@ -659,6 +658,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 	struct task_struct *t;
 
 	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
+	raw_lockdep_assert_held_rcu_node(rnp);
 	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
 		dump_blkd_tasks(rnp, 10);
 	if (rcu_preempt_has_tasks(rnp) &&

From b3e627d3d5092a87fc9b9e37e341610cfecfbfdc Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Tue, 15 Oct 2019 02:55:57 +0000
Subject: [PATCH 16/69] rcu: Make PREEMPT_RCU be a modifier to TREE_RCU

Currently PREEMPT_RCU and TREE_RCU are mutually exclusive Kconfig
options.  But PREEMPT_RCU actually specifies a kind of TREE_RCU,
namely a preemptible TREE_RCU. This commit therefore makes PREEMPT_RCU
be a modifer to the TREE_RCU Kconfig option.  This has the benefit of
simplifying several of the #if expressions that formerly needed to
check both, but now need only check one or the other.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Lai Jiangshan <jiangshanlai@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h   |  4 ++--
 include/trace/events/rcu.h |  4 ++--
 kernel/rcu/Kconfig         | 13 +++++++------
 kernel/rcu/Makefile        |  1 -
 kernel/rcu/rcu.h           |  2 +-
 kernel/rcu/update.c        |  2 +-
 kernel/sysctl.c            |  2 +-
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 0b7506330c87..70a41cd8f58d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -167,7 +167,7 @@ do { \
  * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
  */
 
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+#if defined(CONFIG_TREE_RCU)
 #include <linux/rcutree.h>
 #elif defined(CONFIG_TINY_RCU)
 #include <linux/rcutiny.h>
@@ -601,7 +601,7 @@ do {									      \
  * read-side critical section that would block in a !PREEMPT kernel.
  * But if you want the full story, read on!
  *
- * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU),
+ * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
  * it is illegal to block while in an RCU read-side critical section.
  * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
  * kernel builds, RCU read-side critical sections may be preempted,
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 66122602bd08..85019cf4ed6c 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -41,7 +41,7 @@ TRACE_EVENT(rcu_utilization,
 	TP_printk("%s", __entry->s)
 );
 
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+#if defined(CONFIG_TREE_RCU)
 
 /*
  * Tracepoint for grace-period events.  Takes a string identifying the
@@ -432,7 +432,7 @@ TRACE_EVENT_RCU(rcu_fqs,
 		  __entry->cpu, __entry->qsevent)
 );
 
-#endif /* #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) */
+#endif /* #if defined(CONFIG_TREE_RCU) */
 
 /*
  * Tracepoint for dyntick-idle entry/exit events.  These take a string
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 7644eda17d62..0303934e6ef0 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
 
 config TREE_RCU
 	bool
-	default y if !PREEMPTION && SMP
+	default y if SMP
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP system with hundreds or
@@ -17,6 +17,7 @@ config TREE_RCU
 config PREEMPT_RCU
 	bool
 	default y if PREEMPTION
+	select TREE_RCU
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or
@@ -78,7 +79,7 @@ config TASKS_RCU
 	  user-mode execution as quiescent states.
 
 config RCU_STALL_COMMON
-	def_bool ( TREE_RCU || PREEMPT_RCU )
+	def_bool TREE_RCU
 	help
 	  This option enables RCU CPU stall code that is common between
 	  the TINY and TREE variants of RCU.  The purpose is to allow
@@ -86,13 +87,13 @@ config RCU_STALL_COMMON
 	  making these warnings mandatory for the tree variants.
 
 config RCU_NEED_SEGCBLIST
-	def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
+	def_bool ( TREE_RCU || TREE_SRCU )
 
 config RCU_FANOUT
 	int "Tree-based hierarchical RCU fanout value"
 	range 2 64 if 64BIT
 	range 2 32 if !64BIT
-	depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+	depends on TREE_RCU && RCU_EXPERT
 	default 64 if 64BIT
 	default 32 if !64BIT
 	help
@@ -112,7 +113,7 @@ config RCU_FANOUT_LEAF
 	int "Tree-based hierarchical RCU leaf-level fanout value"
 	range 2 64 if 64BIT
 	range 2 32 if !64BIT
-	depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+	depends on TREE_RCU && RCU_EXPERT
 	default 16
 	help
 	  This option controls the leaf-level fanout of hierarchical
@@ -187,7 +188,7 @@ config RCU_BOOST_DELAY
 
 config RCU_NOCB_CPU
 	bool "Offload RCU callback processing from boot-selected CPUs"
-	depends on TREE_RCU || PREEMPT_RCU
+	depends on TREE_RCU
 	depends on RCU_EXPERT || NO_HZ_FULL
 	default n
 	help
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 020e8b6a644b..82d5fba48b2f 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -9,6 +9,5 @@ obj-$(CONFIG_TINY_SRCU) += srcutiny.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
 obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_PREEMPT_RCU) += tree.o
 obj-$(CONFIG_TINY_RCU) += tiny.o
 obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ab504fbc76ca..eabafde2349e 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -454,7 +454,7 @@ enum rcutorture_type {
 	INVALID_RCU_FLAVOR
 };
 
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+#if defined(CONFIG_TREE_RCU)
 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 			    unsigned long *gp_seq);
 void do_trace_rcu_torture_read(const char *rcutorturename,
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1861103662db..34a7452b25fd 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -435,7 +435,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
 EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
 #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE)
 void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
 			       unsigned long secs,
 			       unsigned long c_old, unsigned long c)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 70665934d53e..d396aaaf19a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1268,7 +1268,7 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_do_static_key,
 	},
 #endif
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+#if defined(CONFIG_TREE_RCU)
 	{
 		.procname	= "panic_on_rcu_stall",
 		.data		= &sysctl_panic_on_rcu_stall,

From 90326f0521a88004194f88f1b597b54347482b5c Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 15 Oct 2019 21:18:14 +0200
Subject: [PATCH 17/69] rcu: Use CONFIG_PREEMPTION where appropriate

The config option `CONFIG_PREEMPT' is used for the preemption model
"Low-Latency Desktop". The config option `CONFIG_PREEMPTION' is enabled
when kernel preemption is enabled which is true for the preemption model
`CONFIG_PREEMPT' and `CONFIG_PREEMPT_RT'.

Use `CONFIG_PREEMPTION' if it applies to both preemption models and not
just to `CONFIG_PREEMPT'.

Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: rcu@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 4 ++--
 kernel/rcu/Kconfig       | 4 ++--
 kernel/rcu/rcutorture.c  | 2 +-
 kernel/rcu/srcutiny.c    | 2 +-
 kernel/rcu/tree.c        | 4 ++--
 kernel/rcu/tree_exp.h    | 2 +-
 kernel/rcu/tree_plugin.h | 4 ++--
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 70a41cd8f58d..eb32fff81c30 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -154,7 +154,7 @@ static inline void exit_tasks_rcu_finish(void) { }
  *
  * This macro resembles cond_resched(), except that it is defined to
  * report potential quiescent states to RCU-tasks even if the cond_resched()
- * machinery were to be shut off, as some advocate for PREEMPT kernels.
+ * machinery were to be shut off, as some advocate for PREEMPTION kernels.
  */
 #define cond_resched_tasks_rcu_qs() \
 do { \
@@ -598,7 +598,7 @@ do {									      \
  *
  * You can avoid reading and understanding the next paragraph by
  * following this rule: don't put anything in an rcu_read_lock() RCU
- * read-side critical section that would block in a !PREEMPT kernel.
+ * read-side critical section that would block in a !PREEMPTION kernel.
  * But if you want the full story, read on!
  *
  * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 0303934e6ef0..1cc940fef17c 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -201,8 +201,8 @@ config RCU_NOCB_CPU
 	  specified at boot time by the rcu_nocbs parameter.  For each
 	  such CPU, a kthread ("rcuox/N") will be created to invoke
 	  callbacks, where the "N" is the CPU being offloaded, and where
-	  the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched
-	  (!PREEMPT kernels).  Nothing prevents this kthread from running
+	  the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched
+	  (!PREEMPTION kernels).  Nothing prevents this kthread from running
 	  on the specified CPUs, but (1) the kthreads may be preempted
 	  between each callback, and (2) affinity or cgroups can be used
 	  to force the kthreads to run on whatever set of CPUs is desired.
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index dee043feb71f..121a0507a7ce 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1730,7 +1730,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
 // Give the scheduler a chance, even on nohz_full CPUs.
 static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
 {
-	if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+	if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
 		// Real call_rcu() floods hit userspace, so emulate that.
 		if (need_resched() || (iter & 0xfff))
 			schedule();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 44d6606b8325..6208c1dae5c9 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
 /*
  * Workqueue handler to drive one grace period and invoke any callbacks
- * that become ready as a result.  Single-CPU and !PREEMPT operation
+ * that become ready as a result.  Single-CPU and !PREEMPTION operation
  * means that we get away with murder on synchronization.  ;-)
  */
 void srcu_drive_gp(struct work_struct *wp)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1694a6b57ad8..c9dbb05e4c13 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2698,9 +2698,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
 
 /*
  * During early boot, any blocking grace-period wait automatically
- * implies a grace period.  Later on, this is never the case for PREEMPT.
+ * implies a grace period.  Later on, this is never the case for PREEMPTION.
  *
- * Howevr, because a context switch is a grace period for !PREEMPT, any
+ * Howevr, because a context switch is a grace period for !PREEMPTION, any
  * blocking grace-period wait automatically implies a grace period if
  * there is only one CPU online at any point time during execution of
  * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d632cd019597..98d078cafa5a 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -670,7 +670,7 @@ static void rcu_exp_handler(void *unused)
 	}
 }
 
-/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */
+/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
 static void sync_sched_exp_online_cleanup(int cpu)
 {
 }
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ed54d36465e2..8cdce111ea73 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -789,7 +789,7 @@ static void __init rcu_bootup_announce(void)
 }
 
 /*
- * Note a quiescent state for PREEMPT=n.  Because we do not need to know
+ * Note a quiescent state for PREEMPTION=n.  Because we do not need to know
  * how many quiescent states passed, just if there was at least one since
  * the start of the grace period, this just sets a flag.  The caller must
  * have disabled preemption.
@@ -839,7 +839,7 @@ void rcu_all_qs(void)
 EXPORT_SYMBOL_GPL(rcu_all_qs);
 
 /*
- * Note a PREEMPT=n context switch.  The caller must have disabled interrupts.
+ * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
  */
 void rcu_note_context_switch(bool preempt)
 {

From c493f1c9c4094afae7f3a0693ae395f3859022b4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 6 Oct 2019 14:33:22 -0700
Subject: [PATCH 18/69] torture: Use gawk instead of awk for systime() function

In many environments, gawk provides systime(), but awk doesn't.
This commit therefore changes awk scripts using systime() to instead be
gawk scripts.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/jitter.sh         | 4 ++--
 tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
index dc49a3ba6111..86a217b41b9f 100755
--- a/tools/testing/selftests/rcutorture/bin/jitter.sh
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -23,12 +23,12 @@ spinmax=${4-1000}
 
 n=1
 
-starttime=`awk 'BEGIN { print systime(); }' < /dev/null`
+starttime=`gawk 'BEGIN { print systime(); }' < /dev/null`
 
 while :
 do
 	# Check for done.
-	t=`awk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null`
+	t=`gawk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null`
 	if test "$t" -gt "$duration"
 	then
 		exit 0;
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 33c669619736..1d98992d1c34 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -123,7 +123,7 @@ qemu_args=$5
 boot_args=$6
 
 cd $KVM
-kstarttime=`awk 'BEGIN { print systime() }' < /dev/null`
+kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null`
 if test -z "$TORTURE_BUILDONLY"
 then
 	echo ' ---' `date`: Starting kernel
@@ -177,7 +177,7 @@ do
 	then
 		qemu_pid=`cat "$resdir/qemu_pid"`
 	fi
-	kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
+	kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
 	if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1
 	then
 		if test $kruntime -ge $seconds
@@ -213,7 +213,7 @@ then
 	oldline="`tail $resdir/console.log`"
 	while :
 	do
-		kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
+		kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
 		if kill -0 $qemu_pid > /dev/null 2>&1
 		then
 			:

From 9aa55ec206a6841e297c9df7e737b3d57f048a82 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sat, 12 Oct 2019 15:29:02 -0700
Subject: [PATCH 19/69] rcutorture: Dispense with Dracut for initrd creation

The dracut scripting does not work on all platforms, and there are no
known failures from the init binary based on the statically linked C
program.  This commit therefore removes the dracut scripting so that the
statically linked C program is always used to create the init "script".

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../selftests/rcutorture/bin/mkinitrd.sh      | 55 +------------------
 1 file changed, 3 insertions(+), 52 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
index 6fa9bd1ddc09..38e424d2392c 100755
--- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
+++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
@@ -20,58 +20,9 @@ if [ -s "$D/initrd/init" ]; then
     exit 0
 fi
 
-T=${TMPDIR-/tmp}/mkinitrd.sh.$$
-trap 'rm -rf $T' 0 2
-mkdir $T
-
-cat > $T/init << '__EOF___'
-#!/bin/sh
-# Run in userspace a few milliseconds every second.  This helps to
-# exercise the NO_HZ_FULL portions of RCU.  The 192 instances of "a" was
-# empirically shown to give a nice multi-millisecond burst of user-mode
-# execution on a 2GHz CPU, as desired.  Modern CPUs will vary from a
-# couple of milliseconds up to perhaps 100 milliseconds, which is an
-# acceptable range.
-#
-# Why not calibrate an exact delay?  Because within this initrd, we
-# are restricted to Bourne-shell builtins, which as far as I know do not
-# provide any means of obtaining a fine-grained timestamp.
-
-a4="a a a a"
-a16="$a4 $a4 $a4 $a4"
-a64="$a16 $a16 $a16 $a16"
-a192="$a64 $a64 $a64"
-while :
-do
-	q=
-	for i in $a192
-	do
-		q="$q $i"
-	done
-	sleep 1
-done
-__EOF___
-
-# Try using dracut to create initrd
-if command -v dracut >/dev/null 2>&1
-then
-	echo Creating $D/initrd using dracut.
-	# Filesystem creation
-	dracut --force --no-hostonly --no-hostonly-cmdline --module "base" $T/initramfs.img
-	cd $D
-	mkdir -p initrd
-	cd initrd
-	zcat $T/initramfs.img | cpio -id
-	cp $T/init init
-	chmod +x init
-	echo Done creating $D/initrd using dracut
-	exit 0
-fi
-
-# No dracut, so create a C-language initrd/init program and statically
-# link it.  This results in a very small initrd, but might be a bit less
-# future-proof than dracut.
-echo "Could not find dracut, attempting C initrd"
+# Create a C-language initrd/init infinite-loop program and statically
+# link it.  This results in a very small initrd.
+echo "Creating a statically linked C-language initrd"
 cd $D
 mkdir -p initrd
 cd initrd

From 517f17aed0ce678dfa82d7dd4e2593fc1bac799c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 14 Oct 2019 07:05:38 -0700
Subject: [PATCH 20/69] torture: Handle jitter for CPUs that cannot be offlined

Currently, jitter.sh assumes that the underlying hypervisor will be
configured with all CPUs hotpluggable, with the possible exception
of CPU 0.  However, there are installations where the hypervisor
prohibits offlining, which breaks jitter.sh.  This commit therefore
lists the CPUs that cannot be offlined up front, and checks for the
case where no CPU can be offlined in the loop.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../selftests/rcutorture/bin/jitter.sh        | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
index 86a217b41b9f..30cb5b27d32e 100755
--- a/tools/testing/selftests/rcutorture/bin/jitter.sh
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -25,6 +25,18 @@ n=1
 
 starttime=`gawk 'BEGIN { print systime(); }' < /dev/null`
 
+nohotplugcpus=
+for i in /sys/devices/system/cpu/cpu[0-9]*
+do
+	if test -f $i/online
+	then
+		:
+	else
+		curcpu=`echo $i | sed -e 's/^[^0-9]*//'`
+		nohotplugcpus="$nohotplugcpus $curcpu"
+	fi
+done
+
 while :
 do
 	# Check for done.
@@ -35,13 +47,15 @@ do
 	fi
 
 	# Set affinity to randomly selected online CPU
-	cpus=`grep 1 /sys/devices/system/cpu/*/online |
-		sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'`
-
-	# Do not leave out poor old cpu0 which may not be hot-pluggable
-	if [ ! -f "/sys/devices/system/cpu/cpu0/online" ]; then
-		cpus="0 $cpus"
+	if cpus=`grep 1 /sys/devices/system/cpu/*/online 2>&1 |
+		 sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'`
+	then
+		:
+	else
+		cpus=
 	fi
+	# Do not leave out non-hot-pluggable CPUs
+	cpus="$cpus $nohotplugcpus"
 
 	cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN {
 		srand(n + me + systime());

From b8dfff975c370912c7ac633ca3e4a812dcd38f96 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 21 Oct 2019 08:38:00 -0700
Subject: [PATCH 21/69] torture: Handle systems lacking the mpstat command

The rcutorture scripting uses the mpstat command to determine how much
the system is being used, and adjusts make's -j argument accordingly.
However, mpstat isn't installed by default, so it would be good if the
scripting does something useful when mpstat isn't present.

This commit therefore makes the scripts assumes that if mpstat is not
present, they are free to use all the CPUs.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/cpus2use.sh | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh
index 4e9485590c10..1dbfb62567d2 100755
--- a/tools/testing/selftests/rcutorture/bin/cpus2use.sh
+++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh
@@ -15,8 +15,15 @@ then
 	exit 0
 fi
 ncpus=`grep '^processor' /proc/cpuinfo | wc -l`
-idlecpus=`mpstat | tail -1 | \
-	awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'`
+if mpstat -V > /dev/null 2>&1
+then
+	idlecpus=`mpstat | tail -1 | \
+		awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'`
+else
+	# No mpstat command, so use all available CPUs.
+	echo The mpstat command is not available, so greedily using all CPUs.
+	idlecpus=$ncpus
+fi
 awk -v ncpus=$ncpus -v idlecpus=$idlecpus < /dev/null '
 BEGIN {
 	cpus2use = idlecpus;

From ebfbaa8dcc84eff146928ed59a8bd4bff932318e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 4 Nov 2019 12:02:12 -0800
Subject: [PATCH 22/69] rcutorture: Add worst-case call_rcu() forward-progress
 results

This commit adds the worst-case results from any call_rcu()
forward-progress tests to the rcutorture test-summary output.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
index 2a7f3f4756a7..9d9a41625dd9 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
@@ -25,6 +25,7 @@ stopstate="`grep 'End-test grace-period state: g' $i/console.log 2> /dev/null |
 	    tail -1 | sed -e 's/^\[[ 0-9.]*] //' |
 	    awk '{ print \"[\" $1 \" \" $5 \" \" $6 \" \" $7 \"]\"; }' |
 	    tr -d '\012\015'`"
+fwdprog="`grep 'rcu_torture_fwd_prog_cr Duration' $i/console.log 2> /dev/null | sed -e 's/^\[[^]]*] //' | sort -k15nr | head -1 | awk '{ print $14 " " $15 }'`"
 if test -z "$ngps"
 then
 	echo "$configfile ------- " $stopstate
@@ -39,7 +40,7 @@ else
 			BEGIN { print ngps / dur }' < /dev/null`
 		title="$title ($ngpsps/s)"
 	fi
-	echo $title $stopstate
+	echo $title $stopstate $fwdprog
 	nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | awk '{for (i=NF-8;i<=NF;i++) sum+=$i; } END {print sum}'`
 	if test -z "$nclosecalls"
 	then

From a289e608b3e740c15f623148c26cdec2d6698ce0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 5 Nov 2019 08:31:56 -0800
Subject: [PATCH 23/69] rcutorture: Pull callback forward-progress data into
 rcu_fwd struct

Now that RCU behaves reasonably well with the current single-kthread
call_rcu() forward-progress testing, it is time to add more kthreads.
This commit takes a first step towards that goal by wrapping what
will be the per-kthread data into a new rcu_fwd structure.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 103 ++++++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 45 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index dee043feb71f..22a75a4b6b40 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1663,23 +1663,34 @@ struct rcu_fwd_cb {
 	struct rcu_fwd_cb *rfc_next;
 	int rfc_gps;
 };
-static DEFINE_SPINLOCK(rcu_fwd_lock);
-static struct rcu_fwd_cb *rcu_fwd_cb_head;
-static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head;
-static long n_launders_cb;
-static unsigned long rcu_fwd_startat;
-static bool rcu_fwd_emergency_stop;
+
 #define MAX_FWD_CB_JIFFIES	(8 * HZ) /* Maximum CB test duration. */
 #define MIN_FWD_CB_LAUNDERS	3	/* This many CB invocations to count. */
 #define MIN_FWD_CBS_LAUNDERED	100	/* Number of counted CBs. */
 #define FWD_CBS_HIST_DIV	10	/* Histogram buckets/second. */
+#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
+
 struct rcu_launder_hist {
 	long n_launders;
 	unsigned long launder_gp_seq;
 };
-#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
-static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
-static unsigned long rcu_launder_gp_seq_start;
+
+struct rcu_fwd {
+	spinlock_t rcu_fwd_lock;
+	struct rcu_fwd_cb *rcu_fwd_cb_head;
+	struct rcu_fwd_cb **rcu_fwd_cb_tail;
+	long n_launders_cb;
+	unsigned long rcu_fwd_startat;
+	struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
+	unsigned long rcu_launder_gp_seq_start;
+};
+
+struct rcu_fwd rcu_fwds = {
+	.rcu_fwd_lock = __SPIN_LOCK_UNLOCKED(rcu_fwds.rcu_fwd_lock),
+	.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head,
+};
+
+bool rcu_fwd_emergency_stop;
 
 static void rcu_torture_fwd_cb_hist(void)
 {
@@ -1688,16 +1699,17 @@ static void rcu_torture_fwd_cb_hist(void)
 	int i;
 	int j;
 
-	for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
-		if (n_launders_hist[i].n_launders > 0)
+	for (i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; i > 0; i--)
+		if (rcu_fwds.n_launders_hist[i].n_launders > 0)
 			break;
 	pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
-		 __func__, jiffies - rcu_fwd_startat);
-	gps_old = rcu_launder_gp_seq_start;
+		 __func__, jiffies - rcu_fwds.rcu_fwd_startat);
+	gps_old = rcu_fwds.rcu_launder_gp_seq_start;
 	for (j = 0; j <= i; j++) {
-		gps = n_launders_hist[j].launder_gp_seq;
+		gps = rcu_fwds.n_launders_hist[j].launder_gp_seq;
 		pr_cont(" %ds/%d: %ld:%ld",
-			j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders,
+			j + 1, FWD_CBS_HIST_DIV,
+			rcu_fwds.n_launders_hist[j].n_launders,
 			rcutorture_seq_diff(gps, gps_old));
 		gps_old = gps;
 	}
@@ -1714,17 +1726,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
 
 	rfcp->rfc_next = NULL;
 	rfcp->rfc_gps++;
-	spin_lock_irqsave(&rcu_fwd_lock, flags);
-	rfcpp = rcu_fwd_cb_tail;
-	rcu_fwd_cb_tail = &rfcp->rfc_next;
+	spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags);
+	rfcpp = rcu_fwds.rcu_fwd_cb_tail;
+	rcu_fwds.rcu_fwd_cb_tail = &rfcp->rfc_next;
 	WRITE_ONCE(*rfcpp, rfcp);
-	WRITE_ONCE(n_launders_cb, n_launders_cb + 1);
-	i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
-	if (i >= ARRAY_SIZE(n_launders_hist))
-		i = ARRAY_SIZE(n_launders_hist) - 1;
-	n_launders_hist[i].n_launders++;
-	n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
-	spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+	WRITE_ONCE(rcu_fwds.n_launders_cb, rcu_fwds.n_launders_cb + 1);
+	i = ((jiffies - rcu_fwds.rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
+	if (i >= ARRAY_SIZE(rcu_fwds.n_launders_hist))
+		i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1;
+	rcu_fwds.n_launders_hist[i].n_launders++;
+	rcu_fwds.n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
+	spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags);
 }
 
 // Give the scheduler a chance, even on nohz_full CPUs.
@@ -1751,16 +1763,16 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
 	struct rcu_fwd_cb *rfcp;
 
 	for (;;) {
-		spin_lock_irqsave(&rcu_fwd_lock, flags);
-		rfcp = rcu_fwd_cb_head;
+		spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags);
+		rfcp = rcu_fwds.rcu_fwd_cb_head;
 		if (!rfcp) {
-			spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+			spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags);
 			break;
 		}
-		rcu_fwd_cb_head = rfcp->rfc_next;
-		if (!rcu_fwd_cb_head)
-			rcu_fwd_cb_tail = &rcu_fwd_cb_head;
-		spin_unlock_irqrestore(&rcu_fwd_lock, flags);
+		rcu_fwds.rcu_fwd_cb_head = rfcp->rfc_next;
+		if (!rcu_fwds.rcu_fwd_cb_head)
+			rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head;
+		spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags);
 		kfree(rfcp);
 		freed++;
 		rcu_torture_fwd_prog_cond_resched(freed);
@@ -1804,8 +1816,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
 	sd = cur_ops->stall_dur() + 1;
 	sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
 	dur = sd4 + torture_random(&trs) % (sd - sd4);
-	WRITE_ONCE(rcu_fwd_startat, jiffies);
-	stopat = rcu_fwd_startat + dur;
+	WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies);
+	stopat = rcu_fwds.rcu_fwd_startat + dur;
 	while (time_before(jiffies, stopat) &&
 	       !shutdown_time_arrived() &&
 	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@@ -1864,23 +1876,23 @@ static void rcu_torture_fwd_prog_cr(void)
 	/* Loop continuously posting RCU callbacks. */
 	WRITE_ONCE(rcu_fwd_cb_nodelay, true);
 	cur_ops->sync(); /* Later readers see above write. */
-	WRITE_ONCE(rcu_fwd_startat, jiffies);
-	stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
+	WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies);
+	stopat = rcu_fwds.rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
 	n_launders = 0;
-	n_launders_cb = 0;
+	rcu_fwds.n_launders_cb = 0; // Hoist initialization for multi-kthread
 	n_launders_sa = 0;
 	n_max_cbs = 0;
 	n_max_gps = 0;
-	for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
-		n_launders_hist[i].n_launders = 0;
+	for (i = 0; i < ARRAY_SIZE(rcu_fwds.n_launders_hist); i++)
+		rcu_fwds.n_launders_hist[i].n_launders = 0;
 	cver = READ_ONCE(rcu_torture_current_version);
 	gps = cur_ops->get_gp_seq();
-	rcu_launder_gp_seq_start = gps;
+	rcu_fwds.rcu_launder_gp_seq_start = gps;
 	tick_dep_set_task(current, TICK_DEP_BIT_RCU);
 	while (time_before(jiffies, stopat) &&
 	       !shutdown_time_arrived() &&
 	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
-		rfcp = READ_ONCE(rcu_fwd_cb_head);
+		rfcp = READ_ONCE(rcu_fwds.rcu_fwd_cb_head);
 		rfcpn = NULL;
 		if (rfcp)
 			rfcpn = READ_ONCE(rfcp->rfc_next);
@@ -1888,7 +1900,7 @@ static void rcu_torture_fwd_prog_cr(void)
 			if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
 			    ++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
 				break;
-			rcu_fwd_cb_head = rfcpn;
+			rcu_fwds.rcu_fwd_cb_head = rfcpn;
 			n_launders++;
 			n_launders_sa++;
 		} else {
@@ -1910,7 +1922,7 @@ static void rcu_torture_fwd_prog_cr(void)
 		}
 	}
 	stoppedat = jiffies;
-	n_launders_cb_snap = READ_ONCE(n_launders_cb);
+	n_launders_cb_snap = READ_ONCE(rcu_fwds.n_launders_cb);
 	cver = READ_ONCE(rcu_torture_current_version) - cver;
 	gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
 	cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
@@ -1921,7 +1933,8 @@ static void rcu_torture_fwd_prog_cr(void)
 		WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
 		pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
 			 __func__,
-			 stoppedat - rcu_fwd_startat, jiffies - stoppedat,
+			 stoppedat - rcu_fwds.rcu_fwd_startat,
+			 jiffies - stoppedat,
 			 n_launders + n_max_cbs - n_launders_cb_snap,
 			 n_launders, n_launders_sa,
 			 n_max_gps, n_max_cbs, cver, gps);
@@ -1943,7 +1956,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,
 	WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
 	     __func__);
 	rcu_torture_fwd_cb_hist();
-	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
+	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2);
 	WRITE_ONCE(rcu_fwd_emergency_stop, true);
 	smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
 	pr_info("%s: Freed %lu RCU callbacks.\n",

From 6b1b832546067caac8c5833abf88fa082d253b2f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 5 Nov 2019 09:08:58 -0800
Subject: [PATCH 24/69] rcutorture: Thread rcu_fwd pointer through
 forward-progress functions

In order to add multiple kthreads, it will be necessary to allow
the various functions to operate on a pointer to their kthread's
rcu_fwd structure.  This commit therefore starts the process of
adding the needed "struct rcu_fwd" parameters and arguments to the
various callback forward-progress functions.

Note that rcutorture_oom_notify() and rcu_torture_fwd_cb_hist() will
eventually need to iterate over all kthreads' rcu_fwd structures.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 78 ++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 22a75a4b6b40..cc88ce910a6d 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1661,6 +1661,7 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp)
 struct rcu_fwd_cb {
 	struct rcu_head rh;
 	struct rcu_fwd_cb *rfc_next;
+	struct rcu_fwd *rfc_rfp;
 	int rfc_gps;
 };
 
@@ -1692,24 +1693,24 @@ struct rcu_fwd rcu_fwds = {
 
 bool rcu_fwd_emergency_stop;
 
-static void rcu_torture_fwd_cb_hist(void)
+static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
 {
 	unsigned long gps;
 	unsigned long gps_old;
 	int i;
 	int j;
 
-	for (i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1; i > 0; i--)
-		if (rcu_fwds.n_launders_hist[i].n_launders > 0)
+	for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
+		if (rfp->n_launders_hist[i].n_launders > 0)
 			break;
 	pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
-		 __func__, jiffies - rcu_fwds.rcu_fwd_startat);
-	gps_old = rcu_fwds.rcu_launder_gp_seq_start;
+		 __func__, jiffies - rfp->rcu_fwd_startat);
+	gps_old = rfp->rcu_launder_gp_seq_start;
 	for (j = 0; j <= i; j++) {
-		gps = rcu_fwds.n_launders_hist[j].launder_gp_seq;
+		gps = rfp->n_launders_hist[j].launder_gp_seq;
 		pr_cont(" %ds/%d: %ld:%ld",
 			j + 1, FWD_CBS_HIST_DIV,
-			rcu_fwds.n_launders_hist[j].n_launders,
+			rfp->n_launders_hist[j].n_launders,
 			rcutorture_seq_diff(gps, gps_old));
 		gps_old = gps;
 	}
@@ -1723,20 +1724,21 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
 	int i;
 	struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh);
 	struct rcu_fwd_cb **rfcpp;
+	struct rcu_fwd *rfp = rfcp->rfc_rfp;
 
 	rfcp->rfc_next = NULL;
 	rfcp->rfc_gps++;
-	spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags);
-	rfcpp = rcu_fwds.rcu_fwd_cb_tail;
-	rcu_fwds.rcu_fwd_cb_tail = &rfcp->rfc_next;
+	spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
+	rfcpp = rfp->rcu_fwd_cb_tail;
+	rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
 	WRITE_ONCE(*rfcpp, rfcp);
-	WRITE_ONCE(rcu_fwds.n_launders_cb, rcu_fwds.n_launders_cb + 1);
-	i = ((jiffies - rcu_fwds.rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
-	if (i >= ARRAY_SIZE(rcu_fwds.n_launders_hist))
-		i = ARRAY_SIZE(rcu_fwds.n_launders_hist) - 1;
-	rcu_fwds.n_launders_hist[i].n_launders++;
-	rcu_fwds.n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
-	spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags);
+	WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
+	i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
+	if (i >= ARRAY_SIZE(rfp->n_launders_hist))
+		i = ARRAY_SIZE(rfp->n_launders_hist) - 1;
+	rfp->n_launders_hist[i].n_launders++;
+	rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
+	spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
 }
 
 // Give the scheduler a chance, even on nohz_full CPUs.
@@ -1786,7 +1788,8 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
 }
 
 /* Carry out need_resched()/cond_resched() forward-progress testing. */
-static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
+static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
+				    int *tested, int *tested_tries)
 {
 	unsigned long cver;
 	unsigned long dur;
@@ -1816,8 +1819,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
 	sd = cur_ops->stall_dur() + 1;
 	sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
 	dur = sd4 + torture_random(&trs) % (sd - sd4);
-	WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies);
-	stopat = rcu_fwds.rcu_fwd_startat + dur;
+	WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
+	stopat = rfp->rcu_fwd_startat + dur;
 	while (time_before(jiffies, stopat) &&
 	       !shutdown_time_arrived() &&
 	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@@ -1852,7 +1855,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
 }
 
 /* Carry out call_rcu() forward-progress testing. */
-static void rcu_torture_fwd_prog_cr(void)
+static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 {
 	unsigned long cver;
 	unsigned long flags;
@@ -1876,23 +1879,23 @@ static void rcu_torture_fwd_prog_cr(void)
 	/* Loop continuously posting RCU callbacks. */
 	WRITE_ONCE(rcu_fwd_cb_nodelay, true);
 	cur_ops->sync(); /* Later readers see above write. */
-	WRITE_ONCE(rcu_fwds.rcu_fwd_startat, jiffies);
-	stopat = rcu_fwds.rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
+	WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
+	stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
 	n_launders = 0;
-	rcu_fwds.n_launders_cb = 0; // Hoist initialization for multi-kthread
+	rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread
 	n_launders_sa = 0;
 	n_max_cbs = 0;
 	n_max_gps = 0;
-	for (i = 0; i < ARRAY_SIZE(rcu_fwds.n_launders_hist); i++)
-		rcu_fwds.n_launders_hist[i].n_launders = 0;
+	for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++)
+		rfp->n_launders_hist[i].n_launders = 0;
 	cver = READ_ONCE(rcu_torture_current_version);
 	gps = cur_ops->get_gp_seq();
-	rcu_fwds.rcu_launder_gp_seq_start = gps;
+	rfp->rcu_launder_gp_seq_start = gps;
 	tick_dep_set_task(current, TICK_DEP_BIT_RCU);
 	while (time_before(jiffies, stopat) &&
 	       !shutdown_time_arrived() &&
 	       !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
-		rfcp = READ_ONCE(rcu_fwds.rcu_fwd_cb_head);
+		rfcp = READ_ONCE(rfp->rcu_fwd_cb_head);
 		rfcpn = NULL;
 		if (rfcp)
 			rfcpn = READ_ONCE(rfcp->rfc_next);
@@ -1900,7 +1903,7 @@ static void rcu_torture_fwd_prog_cr(void)
 			if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
 			    ++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
 				break;
-			rcu_fwds.rcu_fwd_cb_head = rfcpn;
+			rfp->rcu_fwd_cb_head = rfcpn;
 			n_launders++;
 			n_launders_sa++;
 		} else {
@@ -1912,6 +1915,7 @@ static void rcu_torture_fwd_prog_cr(void)
 			n_max_cbs++;
 			n_launders_sa = 0;
 			rfcp->rfc_gps = 0;
+			rfcp->rfc_rfp = rfp;
 		}
 		cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
 		rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
@@ -1922,7 +1926,7 @@ static void rcu_torture_fwd_prog_cr(void)
 		}
 	}
 	stoppedat = jiffies;
-	n_launders_cb_snap = READ_ONCE(rcu_fwds.n_launders_cb);
+	n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
 	cver = READ_ONCE(rcu_torture_current_version) - cver;
 	gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
 	cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
@@ -1933,12 +1937,11 @@ static void rcu_torture_fwd_prog_cr(void)
 		WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
 		pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
 			 __func__,
-			 stoppedat - rcu_fwds.rcu_fwd_startat,
-			 jiffies - stoppedat,
+			 stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat,
 			 n_launders + n_max_cbs - n_launders_cb_snap,
 			 n_launders, n_launders_sa,
 			 n_max_gps, n_max_cbs, cver, gps);
-		rcu_torture_fwd_cb_hist();
+		rcu_torture_fwd_cb_hist(rfp);
 	}
 	schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
 	tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
@@ -1955,7 +1958,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,
 {
 	WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
 	     __func__);
-	rcu_torture_fwd_cb_hist();
+	rcu_torture_fwd_cb_hist(&rcu_fwds);
 	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2);
 	WRITE_ONCE(rcu_fwd_emergency_stop, true);
 	smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
@@ -1980,6 +1983,7 @@ static struct notifier_block rcutorture_oom_nb = {
 /* Carry out grace-period forward-progress testing. */
 static int rcu_torture_fwd_prog(void *args)
 {
+	struct rcu_fwd *rfp = args;
 	int tested = 0;
 	int tested_tries = 0;
 
@@ -1991,8 +1995,8 @@ static int rcu_torture_fwd_prog(void *args)
 		schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
 		WRITE_ONCE(rcu_fwd_emergency_stop, false);
 		register_oom_notifier(&rcutorture_oom_nb);
-		rcu_torture_fwd_prog_nr(&tested, &tested_tries);
-		rcu_torture_fwd_prog_cr();
+		rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
+		rcu_torture_fwd_prog_cr(rfp);
 		unregister_oom_notifier(&rcutorture_oom_nb);
 
 		/* Avoid slow periods, better to test when busy. */
@@ -2027,7 +2031,7 @@ static int __init rcu_torture_fwd_prog_init(void)
 	if (fwd_progress_div <= 0)
 		fwd_progress_div = 4;
 	return torture_create_kthread(rcu_torture_fwd_prog,
-				      NULL, fwd_prog_task);
+				      &rcu_fwds, fwd_prog_task);
 }
 
 /* Callback function for RCU barrier testing. */

From 7beba0c06b588c725962bcc0273a489d46e81ccf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 6 Nov 2019 07:49:31 -0800
Subject: [PATCH 25/69] rcutorture: Move to dynamic initialization of rcu_fwds

In order to add multiple call_rcu() forward-progress kthreads, it will
be necessary to dynamically allocate and initialize.  This commit
therefore moves the initialization from compile time to instead
immediately precede thread-creation time.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index cc88ce910a6d..6f540fed942c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1686,11 +1686,7 @@ struct rcu_fwd {
 	unsigned long rcu_launder_gp_seq_start;
 };
 
-struct rcu_fwd rcu_fwds = {
-	.rcu_fwd_lock = __SPIN_LOCK_UNLOCKED(rcu_fwds.rcu_fwd_lock),
-	.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head,
-};
-
+struct rcu_fwd rcu_fwds;
 bool rcu_fwd_emergency_stop;
 
 static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
@@ -2026,6 +2022,8 @@ static int __init rcu_torture_fwd_prog_init(void)
 		WARN_ON(1); /* Make sure rcutorture notices conflict. */
 		return 0;
 	}
+	spin_lock_init(&rcu_fwds.rcu_fwd_lock);
+	rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head;
 	if (fwd_progress_holdoff <= 0)
 		fwd_progress_holdoff = 1;
 	if (fwd_progress_div <= 0)

From 6764100bd2927060aae91b40fce015f39fc4fd87 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 6 Nov 2019 08:20:20 -0800
Subject: [PATCH 26/69] rcutorture: Complete threading rcu_fwd pointers through
 functions

This commit threads pointers to rcu_fwd structures through the remaining
functions using rcu_fwds directly, namely rcu_torture_fwd_prog_cbfree(),
rcutorture_oom_notify() and rcu_torture_fwd_prog_init().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 6f540fed942c..394baac98ae0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1754,23 +1754,23 @@ static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
  * Free all callbacks on the rcu_fwd_cb_head list, either because the
  * test is over or because we hit an OOM event.
  */
-static unsigned long rcu_torture_fwd_prog_cbfree(void)
+static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp)
 {
 	unsigned long flags;
 	unsigned long freed = 0;
 	struct rcu_fwd_cb *rfcp;
 
 	for (;;) {
-		spin_lock_irqsave(&rcu_fwds.rcu_fwd_lock, flags);
-		rfcp = rcu_fwds.rcu_fwd_cb_head;
+		spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
+		rfcp = rfp->rcu_fwd_cb_head;
 		if (!rfcp) {
-			spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags);
+			spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
 			break;
 		}
-		rcu_fwds.rcu_fwd_cb_head = rfcp->rfc_next;
-		if (!rcu_fwds.rcu_fwd_cb_head)
-			rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head;
-		spin_unlock_irqrestore(&rcu_fwds.rcu_fwd_lock, flags);
+		rfp->rcu_fwd_cb_head = rfcp->rfc_next;
+		if (!rfp->rcu_fwd_cb_head)
+			rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+		spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
 		kfree(rfcp);
 		freed++;
 		rcu_torture_fwd_prog_cond_resched(freed);
@@ -1926,7 +1926,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 	cver = READ_ONCE(rcu_torture_current_version) - cver;
 	gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
 	cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
-	(void)rcu_torture_fwd_prog_cbfree();
+	(void)rcu_torture_fwd_prog_cbfree(rfp);
 
 	if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
 	    !shutdown_time_arrived()) {
@@ -1952,20 +1952,22 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 static int rcutorture_oom_notify(struct notifier_block *self,
 				 unsigned long notused, void *nfreed)
 {
+	struct rcu_fwd *rfp = &rcu_fwds;
+
 	WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
 	     __func__);
-	rcu_torture_fwd_cb_hist(&rcu_fwds);
-	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwds.rcu_fwd_startat)) / 2);
+	rcu_torture_fwd_cb_hist(rfp);
+	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
 	WRITE_ONCE(rcu_fwd_emergency_stop, true);
 	smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
 	pr_info("%s: Freed %lu RCU callbacks.\n",
-		__func__, rcu_torture_fwd_prog_cbfree());
+		__func__, rcu_torture_fwd_prog_cbfree(rfp));
 	rcu_barrier();
 	pr_info("%s: Freed %lu RCU callbacks.\n",
-		__func__, rcu_torture_fwd_prog_cbfree());
+		__func__, rcu_torture_fwd_prog_cbfree(rfp));
 	rcu_barrier();
 	pr_info("%s: Freed %lu RCU callbacks.\n",
-		__func__, rcu_torture_fwd_prog_cbfree());
+		__func__, rcu_torture_fwd_prog_cbfree(rfp));
 	smp_mb(); /* Frees before return to avoid redoing OOM. */
 	(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
 	pr_info("%s returning after OOM processing.\n", __func__);
@@ -2008,6 +2010,8 @@ static int rcu_torture_fwd_prog(void *args)
 /* If forward-progress checking is requested and feasible, spawn the thread. */
 static int __init rcu_torture_fwd_prog_init(void)
 {
+	struct rcu_fwd *rfp = &rcu_fwds;
+
 	if (!fwd_progress)
 		return 0; /* Not requested, so don't do it. */
 	if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 ||
@@ -2022,14 +2026,13 @@ static int __init rcu_torture_fwd_prog_init(void)
 		WARN_ON(1); /* Make sure rcutorture notices conflict. */
 		return 0;
 	}
-	spin_lock_init(&rcu_fwds.rcu_fwd_lock);
-	rcu_fwds.rcu_fwd_cb_tail = &rcu_fwds.rcu_fwd_cb_head;
+	spin_lock_init(&rfp->rcu_fwd_lock);
+	rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
 	if (fwd_progress_holdoff <= 0)
 		fwd_progress_holdoff = 1;
 	if (fwd_progress_div <= 0)
 		fwd_progress_div = 4;
-	return torture_create_kthread(rcu_torture_fwd_prog,
-				      &rcu_fwds, fwd_prog_task);
+	return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
 }
 
 /* Callback function for RCU barrier testing. */

From 5155be9994e557618a8312389fb4e52dfbf28a3c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 6 Nov 2019 08:35:08 -0800
Subject: [PATCH 27/69] rcutorture: Dynamically allocate rcu_fwds structure

This commit switches from static structure to dynamic allocation
for rcu_fwds as another step towards providing multiple call_rcu()
forward-progress kthreads.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 394baac98ae0..f77f4d886cc1 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1686,7 +1686,7 @@ struct rcu_fwd {
 	unsigned long rcu_launder_gp_seq_start;
 };
 
-struct rcu_fwd rcu_fwds;
+struct rcu_fwd *rcu_fwds;
 bool rcu_fwd_emergency_stop;
 
 static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
@@ -1952,7 +1952,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 static int rcutorture_oom_notify(struct notifier_block *self,
 				 unsigned long notused, void *nfreed)
 {
-	struct rcu_fwd *rfp = &rcu_fwds;
+	struct rcu_fwd *rfp = rcu_fwds;
 
 	WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
 	     __func__);
@@ -2010,7 +2010,7 @@ static int rcu_torture_fwd_prog(void *args)
 /* If forward-progress checking is requested and feasible, spawn the thread. */
 static int __init rcu_torture_fwd_prog_init(void)
 {
-	struct rcu_fwd *rfp = &rcu_fwds;
+	struct rcu_fwd *rfp;
 
 	if (!fwd_progress)
 		return 0; /* Not requested, so don't do it. */
@@ -2026,12 +2026,15 @@ static int __init rcu_torture_fwd_prog_init(void)
 		WARN_ON(1); /* Make sure rcutorture notices conflict. */
 		return 0;
 	}
-	spin_lock_init(&rfp->rcu_fwd_lock);
-	rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
 	if (fwd_progress_holdoff <= 0)
 		fwd_progress_holdoff = 1;
 	if (fwd_progress_div <= 0)
 		fwd_progress_div = 4;
+	rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
+	if (!rfp)
+		return -ENOMEM;
+	spin_lock_init(&rfp->rcu_fwd_lock);
+	rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
 	return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
 }
 

From 25b4da74a955bf956428ab29e54aadf4fffab0a3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 22 Nov 2019 06:14:21 -0800
Subject: [PATCH 28/69] torture: Allow "CFLIST" to specify default list of
 scenarios

On a large system, it can be convenient to tell rcutorture to run
several instances of the default scenarios.  Currently, this requires
explicitly listing them, for example, "--configs '2*SRCU-N 2*SRCU-P...'".
Although this works, it is rather inconvenient.

This commit therefore allows "CFLIST" to be specified to indicate the
default list of scenarios called out in the relevant CFLIST file, for
example, for RCU, tools/testing/selftests/rcutorture/configs/rcu/CFLIST.
In addition, multipliers may be used to run multiple instances of all
the scenarios.  For example, on a 256-CPU system, "--configs '3*CFLIST'"
would run three instances of each scenario concurrently with one CPU
left over.  Thus "--configs '3*CFLIST TINY01'" would exactly consume all
256 CPUs, which makes rcutorture's jitter feature more effective.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm.sh | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 72518580df23..e19151c6e5e5 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -198,9 +198,10 @@ fi
 
 CONFIGFRAG=${KVM}/configs/${TORTURE_SUITE}; export CONFIGFRAG
 
+defaultconfigs="`tr '\012' ' ' < $CONFIGFRAG/CFLIST`"
 if test -z "$configs"
 then
-	configs="`cat $CONFIGFRAG/CFLIST`"
+	configs=$defaultconfigs
 fi
 
 if test -z "$resdir"
@@ -209,7 +210,7 @@ then
 fi
 
 # Create a file of test-name/#cpus pairs, sorted by decreasing #cpus.
-touch $T/cfgcpu
+configs_derep=
 for CF in $configs
 do
 	case $CF in
@@ -222,15 +223,21 @@ do
 		CF1=$CF
 		;;
 	esac
+	for ((cur_rep=0;cur_rep<$config_reps;cur_rep++))
+	do
+		configs_derep="$configs_derep $CF1"
+	done
+done
+touch $T/cfgcpu
+configs_derep="`echo $configs_derep | sed -e "s/\<CFLIST\>/$defaultconfigs/g"`"
+for CF1 in $configs_derep
+do
 	if test -f "$CONFIGFRAG/$CF1"
 	then
 		cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1`
 		cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
 		cpu_count=`configfrag_boot_maxcpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
-		for ((cur_rep=0;cur_rep<$config_reps;cur_rep++))
-		do
-			echo $CF1 $cpu_count >> $T/cfgcpu
-		done
+		echo $CF1 $cpu_count >> $T/cfgcpu
 	else
 		echo "The --configs file $CF1 does not exist, terminating."
 		exit 1

From b22eb7cefb9d31cf862542f9cef90f97c0294842 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 Nov 2019 14:33:28 -0800
Subject: [PATCH 29/69] torture: Hoist calls to lscpu to higher-level kvm.sh
 script

On some kernels, concurrent calls to the lscpu command result in severe
slowdowns.  For example, on v4.16, a single lscpu invocation takes about
two milliseconds, four concurrent invocations more than two seconds,
and 16 concurrent invocations more than 20 seconds.  Given that the only
goal is to learn the number of CPUs, invoking lscpu but once suffices.
This commit therefore invokes lscpu early in kvm.sh execution, setting
the initial value of the TORTURE_ALLOTED_CPUS environment variable.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../selftests/rcutorture/bin/kvm-test-1-run.sh        |  7 +++----
 tools/testing/selftests/rcutorture/bin/kvm.sh         | 11 ++++++++---
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 1d98992d1c34..e0352304b98b 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -133,11 +133,10 @@ fi
 qemu_args="-enable-kvm -nographic $qemu_args"
 cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment`
 cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"`
-vcpus=`identify_qemu_vcpus`
-if test $cpu_count -gt $vcpus
+if test "$cpu_count" -gt "$TORTURE_ALLOTED_CPUS"
 then
-	echo CPU count limited from $cpu_count to $vcpus | tee -a $resdir/Warnings
-	cpu_count=$vcpus
+	echo CPU count limited from $cpu_count to $TORTURE_ALLOTED_CPUS | tee -a $resdir/Warnings
+	cpu_count=$TORTURE_ALLOTED_CPUS
 fi
 qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`"
 
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index e19151c6e5e5..78d18ab8e954 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -24,7 +24,9 @@ dur=$((30*60))
 dryrun=""
 KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
 PATH=${KVM}/bin:$PATH; export PATH
-TORTURE_ALLOTED_CPUS=""
+. functions.sh
+
+TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`"
 TORTURE_DEFCONFIG=defconfig
 TORTURE_BOOT_IMAGE=""
 TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD
@@ -40,8 +42,6 @@ cpus=0
 ds=`date +%Y.%m.%d-%H:%M:%S`
 jitter="-1"
 
-. functions.sh
-
 usage () {
 	echo "Usage: $scriptname optional arguments:"
 	echo "       --bootargs kernel-boot-arguments"
@@ -93,6 +93,11 @@ do
 		checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--'
 		cpus=$2
 		TORTURE_ALLOTED_CPUS="$2"
+		max_cpus="`identify_qemu_vcpus`"
+		if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus"
+		then
+			TORTURE_ALLOTED_CPUS=$max_cpus
+		fi
 		shift
 		;;
 	--datestamp)

From 9ffdd7982417e2e227e295c4dea9cec652a71983 Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Date: Tue, 29 Oct 2019 01:54:17 +0530
Subject: [PATCH 30/69] doc: Convert arrayRCU.txt to arrayRCU.rst

This patch converts arrayRCU from .txt to .rst format, and also adds
it to the index.rst file.

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
[ paulmck: Trimmed trailing whitespace. ]
Tested-by: Phong Tran <tranmanphong@gmail.com>
Tested-by: Amol Grover <frextrite@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../RCU/{arrayRCU.txt => arrayRCU.rst}        | 34 +++++++++++++------
 Documentation/RCU/index.rst                   |  1 +
 2 files changed, 24 insertions(+), 11 deletions(-)
 rename Documentation/RCU/{arrayRCU.txt => arrayRCU.rst} (85%)

diff --git a/Documentation/RCU/arrayRCU.txt b/Documentation/RCU/arrayRCU.rst
similarity index 85%
rename from Documentation/RCU/arrayRCU.txt
rename to Documentation/RCU/arrayRCU.rst
index f05a9afb2c39..4051ea3871ef 100644
--- a/Documentation/RCU/arrayRCU.txt
+++ b/Documentation/RCU/arrayRCU.rst
@@ -1,19 +1,21 @@
-Using RCU to Protect Read-Mostly Arrays
+.. _array_rcu_doc:
 
+Using RCU to Protect Read-Mostly Arrays
+=======================================
 
 Although RCU is more commonly used to protect linked lists, it can
 also be used to protect arrays.  Three situations are as follows:
 
-1.  Hash Tables
+1.  :ref:`Hash Tables <hash_tables>`
 
-2.  Static Arrays
+2.  :ref:`Static Arrays <static_arrays>`
 
-3.  Resizeable Arrays
+3.  :ref:`Resizable Arrays <resizable_arrays>`
 
 Each of these three situations involves an RCU-protected pointer to an
 array that is separately indexed.  It might be tempting to consider use
 of RCU to instead protect the index into an array, however, this use
-case is -not- supported.  The problem with RCU-protected indexes into
+case is **not** supported.  The problem with RCU-protected indexes into
 arrays is that compilers can play way too many optimization games with
 integers, which means that the rules governing handling of these indexes
 are far more trouble than they are worth.  If RCU-protected indexes into
@@ -24,16 +26,20 @@ to be safely used.
 That aside, each of the three RCU-protected pointer situations are
 described in the following sections.
 
+.. _hash_tables:
 
 Situation 1: Hash Tables
+------------------------
 
 Hash tables are often implemented as an array, where each array entry
 has a linked-list hash chain.  Each hash chain can be protected by RCU
 as described in the listRCU.txt document.  This approach also applies
 to other array-of-list situations, such as radix trees.
 
+.. _static_arrays:
 
 Situation 2: Static Arrays
+--------------------------
 
 Static arrays, where the data (rather than a pointer to the data) is
 located in each array element, and where the array is never resized,
@@ -41,13 +47,17 @@ have not been used with RCU.  Rik van Riel recommends using seqlock in
 this situation, which would also have minimal read-side overhead as long
 as updates are rare.
 
-Quick Quiz:  Why is it so important that updates be rare when
-	     using seqlock?
+Quick Quiz:
+		Why is it so important that updates be rare when using seqlock?
 
+:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>`
 
-Situation 3: Resizeable Arrays
+.. _resizable_arrays:
 
-Use of RCU for resizeable arrays is demonstrated by the grow_ary()
+Situation 3: Resizable Arrays
+------------------------------
+
+Use of RCU for resizable arrays is demonstrated by the grow_ary()
 function formerly used by the System V IPC code.  The array is used
 to map from semaphore, message-queue, and shared-memory IDs to the data
 structure that represents the corresponding IPC construct.  The grow_ary()
@@ -60,7 +70,7 @@ the remainder of the new, updates the ids->entries pointer to point to
 the new array, and invokes ipc_rcu_putref() to free up the old array.
 Note that rcu_assign_pointer() is used to update the ids->entries pointer,
 which includes any memory barriers required on whatever architecture
-you are running on.
+you are running on::
 
 	static int grow_ary(struct ipc_ids* ids, int newsize)
 	{
@@ -112,7 +122,7 @@ a simple check suffices.  The pointer to the structure corresponding
 to the desired IPC object is placed in "out", with NULL indicating
 a non-existent entry.  After acquiring "out->lock", the "out->deleted"
 flag indicates whether the IPC object is in the process of being
-deleted, and, if not, the pointer is returned.
+deleted, and, if not, the pointer is returned::
 
 	struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
 	{
@@ -144,8 +154,10 @@ deleted, and, if not, the pointer is returned.
 		return out;
 	}
 
+.. _answer_quick_quiz_seqlock:
 
 Answer to Quick Quiz:
+	Why is it so important that updates be rare when using seqlock?
 
 	The reason that it is important that updates be rare when
 	using seqlock is that frequent updates can livelock readers.
diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 5c99185710fa..8d20d44f8fd4 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -7,6 +7,7 @@ RCU concepts
 .. toctree::
    :maxdepth: 3
 
+   arrayRCU
    rcu
    listRCU
    UP

From 6705cae433cffc37b183ded6ca9fe5c6d8ae8a9d Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Date: Tue, 29 Oct 2019 03:12:52 +0530
Subject: [PATCH 31/69] doc: Converted NMI-RCU.txt to NMI-RCU.rst.

This patch converts NMI-RCU from txt to rst format.
Also adds NMI-RCU in the index.rst file.

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
[ paulmck: Apply feedback from Phong Tran. ]
Tested-by: Phong Tran <tranmanphong@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../RCU/{NMI-RCU.txt => NMI-RCU.rst}          | 53 ++++++++++---------
 Documentation/RCU/index.rst                   |  1 +
 2 files changed, 29 insertions(+), 25 deletions(-)
 rename Documentation/RCU/{NMI-RCU.txt => NMI-RCU.rst} (73%)

diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.rst
similarity index 73%
rename from Documentation/RCU/NMI-RCU.txt
rename to Documentation/RCU/NMI-RCU.rst
index 881353fd5bff..180958388ff9 100644
--- a/Documentation/RCU/NMI-RCU.txt
+++ b/Documentation/RCU/NMI-RCU.rst
@@ -1,4 +1,7 @@
+.. _NMI_rcu_doc:
+
 Using RCU to Protect Dynamic NMI Handlers
+=========================================
 
 
 Although RCU is usually used to protect read-mostly data structures,
@@ -9,7 +12,7 @@ work in "arch/x86/oprofile/nmi_timer_int.c" and in
 "arch/x86/kernel/traps.c".
 
 The relevant pieces of code are listed below, each followed by a
-brief explanation.
+brief explanation::
 
 	static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
 	{
@@ -18,12 +21,12 @@ brief explanation.
 
 The dummy_nmi_callback() function is a "dummy" NMI handler that does
 nothing, but returns zero, thus saying that it did nothing, allowing
-the NMI handler to take the default machine-specific action.
+the NMI handler to take the default machine-specific action::
 
 	static nmi_callback_t nmi_callback = dummy_nmi_callback;
 
 This nmi_callback variable is a global function pointer to the current
-NMI handler.
+NMI handler::
 
 	void do_nmi(struct pt_regs * regs, long error_code)
 	{
@@ -53,11 +56,12 @@ anyway.  However, in practice it is a good documentation aid, particularly
 for anyone attempting to do something similar on Alpha or on systems
 with aggressive optimizing compilers.
 
-Quick Quiz:  Why might the rcu_dereference_sched() be necessary on Alpha,
-	     given that the code referenced by the pointer is read-only?
+Quick Quiz:
+		Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
 
+:ref:`Answer to Quick Quiz <answer_quick_quiz_NMI>`
 
-Back to the discussion of NMI and RCU...
+Back to the discussion of NMI and RCU::
 
 	void set_nmi_callback(nmi_callback_t callback)
 	{
@@ -68,7 +72,7 @@ The set_nmi_callback() function registers an NMI handler.  Note that any
 data that is to be used by the callback must be initialized up -before-
 the call to set_nmi_callback().  On architectures that do not order
 writes, the rcu_assign_pointer() ensures that the NMI handler sees the
-initialized values.
+initialized values::
 
 	void unset_nmi_callback(void)
 	{
@@ -82,7 +86,7 @@ up any data structures used by the old NMI handler until execution
 of it completes on all other CPUs.
 
 One way to accomplish this is via synchronize_rcu(), perhaps as
-follows:
+follows::
 
 	unset_nmi_callback();
 	synchronize_rcu();
@@ -98,24 +102,23 @@ to free up the handler's data as soon as synchronize_rcu() returns.
 Important note: for this to work, the architecture in question must
 invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
 
+.. _answer_quick_quiz_NMI:
 
-Answer to Quick Quiz
+Answer to Quick Quiz:
+	Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
 
-	Why might the rcu_dereference_sched() be necessary on Alpha, given
-	that the code referenced by the pointer is read-only?
+	The caller to set_nmi_callback() might well have
+	initialized some data that is to be used by the new NMI
+	handler.  In this case, the rcu_dereference_sched() would
+	be needed, because otherwise a CPU that received an NMI
+	just after the new handler was set might see the pointer
+	to the new NMI handler, but the old pre-initialized
+	version of the handler's data.
 
-	Answer: The caller to set_nmi_callback() might well have
-		initialized some data that is to be used by the new NMI
-		handler.  In this case, the rcu_dereference_sched() would
-		be needed, because otherwise a CPU that received an NMI
-		just after the new handler was set might see the pointer
-		to the new NMI handler, but the old pre-initialized
-		version of the handler's data.
+	This same sad story can happen on other CPUs when using
+	a compiler with aggressive pointer-value speculation
+	optimizations.
 
-		This same sad story can happen on other CPUs when using
-		a compiler with aggressive pointer-value speculation
-		optimizations.
-
-		More important, the rcu_dereference_sched() makes it
-		clear to someone reading the code that the pointer is
-		being protected by RCU-sched.
+	More important, the rcu_dereference_sched() makes it
+	clear to someone reading the code that the pointer is
+	being protected by RCU-sched.
diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 8d20d44f8fd4..627128c230dc 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -10,6 +10,7 @@ RCU concepts
    arrayRCU
    rcu
    listRCU
+   NMI-RCU
    UP
 
    Design/Memory-Ordering/Tree-RCU-Memory-Ordering

From 5e1bc932818f74082e8331b59aa550101ead08e0 Mon Sep 17 00:00:00 2001
From: Phong Tran <tranmanphong@gmail.com>
Date: Wed, 6 Nov 2019 20:09:50 +0700
Subject: [PATCH 32/69] doc: Convert whatisRCU.txt to .rst

This commit updates whatisRCU.txt to the new .rst format.
This change includes:

- Formatting bullet lists
- Adding literal blocks
- Links from table of contents to corresponding sections
- Links to external documents
- Reformat quick quizzes

Signed-off-by: Phong Tran <tranmanphong@gmail.com>
Tested-by: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
[ tranmanphong: Apply Amol Grover feedback. ]
Reviewed-by: Amol Grover <frextrite@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst                   |   1 +
 .../RCU/{whatisRCU.txt => whatisRCU.rst}      | 282 +++++++++++-------
 2 files changed, 177 insertions(+), 106 deletions(-)
 rename Documentation/RCU/{whatisRCU.txt => whatisRCU.rst} (84%)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 627128c230dc..b9b11481c727 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -8,6 +8,7 @@ RCU concepts
    :maxdepth: 3
 
    arrayRCU
+   whatisRCU
    rcu
    listRCU
    NMI-RCU
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.rst
similarity index 84%
rename from Documentation/RCU/whatisRCU.txt
rename to Documentation/RCU/whatisRCU.rst
index 58ba05c4d97f..2f6f6ebbc8b0 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.rst
@@ -1,15 +1,18 @@
+.. _whatisrcu_doc:
+
 What is RCU?  --  "Read, Copy, Update"
+======================================
 
 Please note that the "What is RCU?" LWN series is an excellent place
 to start learning about RCU:
 
-1.	What is RCU, Fundamentally?  http://lwn.net/Articles/262464/
-2.	What is RCU? Part 2: Usage   http://lwn.net/Articles/263130/
-3.	RCU part 3: the RCU API      http://lwn.net/Articles/264090/
-4.	The RCU API, 2010 Edition    http://lwn.net/Articles/418853/
-	2010 Big API Table           http://lwn.net/Articles/419086/
-5.	The RCU API, 2014 Edition    http://lwn.net/Articles/609904/
-	2014 Big API Table           http://lwn.net/Articles/609973/
+| 1.	What is RCU, Fundamentally?  http://lwn.net/Articles/262464/
+| 2.	What is RCU? Part 2: Usage   http://lwn.net/Articles/263130/
+| 3.	RCU part 3: the RCU API      http://lwn.net/Articles/264090/
+| 4.	The RCU API, 2010 Edition    http://lwn.net/Articles/418853/
+| 	2010 Big API Table           http://lwn.net/Articles/419086/
+| 5.	The RCU API, 2014 Edition    http://lwn.net/Articles/609904/
+|	2014 Big API Table           http://lwn.net/Articles/609973/
 
 
 What is RCU?
@@ -24,14 +27,21 @@ the experience has been that different people must take different paths
 to arrive at an understanding of RCU.  This document provides several
 different paths, as follows:
 
-1.	RCU OVERVIEW
-2.	WHAT IS RCU'S CORE API?
-3.	WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
-4.	WHAT IF MY UPDATING THREAD CANNOT BLOCK?
-5.	WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
-6.	ANALOGY WITH READER-WRITER LOCKING
-7.	FULL LIST OF RCU APIs
-8.	ANSWERS TO QUICK QUIZZES
+:ref:`1.	RCU OVERVIEW <1_whatisRCU>`
+
+:ref:`2.	WHAT IS RCU'S CORE API? <2_whatisRCU>`
+
+:ref:`3.	WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>`
+
+:ref:`4.	WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>`
+
+:ref:`5.	WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>`
+
+:ref:`6.	ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>`
+
+:ref:`7.	FULL LIST OF RCU APIs <7_whatisRCU>`
+
+:ref:`8.	ANSWERS TO QUICK QUIZZES <8_whatisRCU>`
 
 People who prefer starting with a conceptual overview should focus on
 Section 1, though most readers will profit by reading this section at
@@ -49,8 +59,10 @@ everything, feel free to read the whole thing -- but if you are really
 that type of person, you have perused the source code and will therefore
 never need this document anyway.  ;-)
 
+.. _1_whatisRCU:
 
 1.  RCU OVERVIEW
+----------------
 
 The basic idea behind RCU is to split updates into "removal" and
 "reclamation" phases.  The removal phase removes references to data items
@@ -116,8 +128,10 @@ So how the heck can a reclaimer tell when a reader is done, given
 that readers are not doing any sort of synchronization operations???
 Read on to learn about how RCU's API makes this easy.
 
+.. _2_whatisRCU:
 
 2.  WHAT IS RCU'S CORE API?
+---------------------------
 
 The core RCU API is quite small:
 
@@ -136,7 +150,7 @@ later.  See the kernel docbook documentation for more info, or look directly
 at the function header comments.
 
 rcu_read_lock()
-
+^^^^^^^^^^^^^^^
 	void rcu_read_lock(void);
 
 	Used by a reader to inform the reclaimer that the reader is
@@ -150,7 +164,7 @@ rcu_read_lock()
 	longer-term references to data structures.
 
 rcu_read_unlock()
-
+^^^^^^^^^^^^^^^^^
 	void rcu_read_unlock(void);
 
 	Used by a reader to inform the reclaimer that the reader is
@@ -158,15 +172,15 @@ rcu_read_unlock()
 	read-side critical sections may be nested and/or overlapping.
 
 synchronize_rcu()
-
+^^^^^^^^^^^^^^^^^
 	void synchronize_rcu(void);
 
 	Marks the end of updater code and the beginning of reclaimer
 	code.  It does this by blocking until all pre-existing RCU
 	read-side critical sections on all CPUs have completed.
-	Note that synchronize_rcu() will -not- necessarily wait for
+	Note that synchronize_rcu() will **not** necessarily wait for
 	any subsequent RCU read-side critical sections to complete.
-	For example, consider the following sequence of events:
+	For example, consider the following sequence of events::
 
 	         CPU 0                  CPU 1                 CPU 2
 	     ----------------- ------------------------- ---------------
@@ -182,7 +196,7 @@ synchronize_rcu()
 	any that begin after synchronize_rcu() is invoked.
 
 	Of course, synchronize_rcu() does not necessarily return
-	-immediately- after the last pre-existing RCU read-side critical
+	**immediately** after the last pre-existing RCU read-side critical
 	section completes.  For one thing, there might well be scheduling
 	delays.  For another thing, many RCU implementations process
 	requests in batches in order to improve efficiencies, which can
@@ -211,10 +225,10 @@ synchronize_rcu()
 	checklist.txt for some approaches to limiting the update rate.
 
 rcu_assign_pointer()
-
+^^^^^^^^^^^^^^^^^^^^
 	void rcu_assign_pointer(p, typeof(p) v);
 
-	Yes, rcu_assign_pointer() -is- implemented as a macro, though it
+	Yes, rcu_assign_pointer() **is** implemented as a macro, though it
 	would be cool to be able to declare a function in this manner.
 	(Compiler experts will no doubt disagree.)
 
@@ -231,7 +245,7 @@ rcu_assign_pointer()
 	the _rcu list-manipulation primitives such as list_add_rcu().
 
 rcu_dereference()
-
+^^^^^^^^^^^^^^^^^
 	typeof(p) rcu_dereference(p);
 
 	Like rcu_assign_pointer(), rcu_dereference() must be implemented
@@ -248,13 +262,13 @@ rcu_dereference()
 
 	Common coding practice uses rcu_dereference() to copy an
 	RCU-protected pointer to a local variable, then dereferences
-	this local variable, for example as follows:
+	this local variable, for example as follows::
 
 		p = rcu_dereference(head.next);
 		return p->data;
 
 	However, in this case, one could just as easily combine these
-	into one statement:
+	into one statement::
 
 		return rcu_dereference(head.next)->data;
 
@@ -266,8 +280,8 @@ rcu_dereference()
 	unnecessary overhead on Alpha CPUs.
 
 	Note that the value returned by rcu_dereference() is valid
-	only within the enclosing RCU read-side critical section [1].
-	For example, the following is -not- legal:
+	only within the enclosing RCU read-side critical section [1]_.
+	For example, the following is **not** legal::
 
 		rcu_read_lock();
 		p = rcu_dereference(head.next);
@@ -290,9 +304,9 @@ rcu_dereference()
 	at any time, including immediately after the rcu_dereference().
 	And, again like rcu_assign_pointer(), rcu_dereference() is
 	typically used indirectly, via the _rcu list-manipulation
-	primitives, such as list_for_each_entry_rcu() [2].
+	primitives, such as list_for_each_entry_rcu() [2]_.
 
-	[1] The variant rcu_dereference_protected() can be used outside
+.. 	[1] The variant rcu_dereference_protected() can be used outside
 	of an RCU read-side critical section as long as the usage is
 	protected by locks acquired by the update-side code.  This variant
 	avoids the lockdep warning that would happen when using (for
@@ -305,7 +319,7 @@ rcu_dereference()
 	a lockdep splat is emitted.  See Documentation/RCU/Design/Requirements/Requirements.rst
 	and the API's code comments for more details and example usage.
 
-	[2] If the list_for_each_entry_rcu() instance might be used by
+.. 	[2] If the list_for_each_entry_rcu() instance might be used by
 	update-side code as well as by RCU readers, then an additional
 	lockdep expression can be added to its list of arguments.
 	For example, given an additional "lock_is_held(&mylock)" argument,
@@ -315,6 +329,7 @@ rcu_dereference()
 
 The following diagram shows how each API communicates among the
 reader, updater, and reclaimer.
+::
 
 
 	    rcu_assign_pointer()
@@ -375,12 +390,16 @@ c.	RCU applied to scheduler and interrupt/NMI-handler tasks.
 Again, most uses will be of (a).  The (b) and (c) cases are important
 for specialized uses, but are relatively uncommon.
 
+.. _3_whatisRCU:
 
 3.  WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
+-----------------------------------------------
 
 This section shows a simple use of the core RCU API to protect a
 global pointer to a dynamically allocated structure.  More-typical
-uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
+uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`,
+:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`.
+::
 
 	struct foo {
 		int a;
@@ -440,40 +459,43 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
 
 So, to sum up:
 
-o	Use rcu_read_lock() and rcu_read_unlock() to guard RCU
+-	Use rcu_read_lock() and rcu_read_unlock() to guard RCU
 	read-side critical sections.
 
-o	Within an RCU read-side critical section, use rcu_dereference()
+-	Within an RCU read-side critical section, use rcu_dereference()
 	to dereference RCU-protected pointers.
 
-o	Use some solid scheme (such as locks or semaphores) to
+-	Use some solid scheme (such as locks or semaphores) to
 	keep concurrent updates from interfering with each other.
 
-o	Use rcu_assign_pointer() to update an RCU-protected pointer.
+-	Use rcu_assign_pointer() to update an RCU-protected pointer.
 	This primitive protects concurrent readers from the updater,
-	-not- concurrent updates from each other!  You therefore still
+	**not** concurrent updates from each other!  You therefore still
 	need to use locking (or something similar) to keep concurrent
 	rcu_assign_pointer() primitives from interfering with each other.
 
-o	Use synchronize_rcu() -after- removing a data element from an
-	RCU-protected data structure, but -before- reclaiming/freeing
+-	Use synchronize_rcu() **after** removing a data element from an
+	RCU-protected data structure, but **before** reclaiming/freeing
 	the data element, in order to wait for the completion of all
 	RCU read-side critical sections that might be referencing that
 	data item.
 
 See checklist.txt for additional rules to follow when using RCU.
-And again, more-typical uses of RCU may be found in listRCU.txt,
-arrayRCU.txt, and NMI-RCU.txt.
+And again, more-typical uses of RCU may be found in :ref:`listRCU.rst
+<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst
+<NMI_rcu_doc>`.
 
+.. _4_whatisRCU:
 
 4.  WHAT IF MY UPDATING THREAD CANNOT BLOCK?
+--------------------------------------------
 
 In the example above, foo_update_a() blocks until a grace period elapses.
 This is quite simple, but in some cases one cannot afford to wait so
 long -- there might be other high-priority work to be done.
 
 In such cases, one uses call_rcu() rather than synchronize_rcu().
-The call_rcu() API is as follows:
+The call_rcu() API is as follows::
 
 	void call_rcu(struct rcu_head * head,
 		      void (*func)(struct rcu_head *head));
@@ -481,7 +503,7 @@ The call_rcu() API is as follows:
 This function invokes func(head) after a grace period has elapsed.
 This invocation might happen from either softirq or process context,
 so the function is not permitted to block.  The foo struct needs to
-have an rcu_head structure added, perhaps as follows:
+have an rcu_head structure added, perhaps as follows::
 
 	struct foo {
 		int a;
@@ -490,7 +512,7 @@ have an rcu_head structure added, perhaps as follows:
 		struct rcu_head rcu;
 	};
 
-The foo_update_a() function might then be written as follows:
+The foo_update_a() function might then be written as follows::
 
 	/*
 	 * Create a new struct foo that is the same as the one currently
@@ -520,7 +542,7 @@ The foo_update_a() function might then be written as follows:
 		call_rcu(&old_fp->rcu, foo_reclaim);
 	}
 
-The foo_reclaim() function might appear as follows:
+The foo_reclaim() function might appear as follows::
 
 	void foo_reclaim(struct rcu_head *rp)
 	{
@@ -544,7 +566,7 @@ namely foo_reclaim().
 The summary of advice is the same as for the previous section, except
 that we are now using call_rcu() rather than synchronize_rcu():
 
-o	Use call_rcu() -after- removing a data element from an
+-	Use call_rcu() **after** removing a data element from an
 	RCU-protected data structure in order to register a callback
 	function that will be invoked after the completion of all RCU
 	read-side critical sections that might be referencing that
@@ -552,14 +574,16 @@ o	Use call_rcu() -after- removing a data element from an
 
 If the callback for call_rcu() is not doing anything more than calling
 kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
-to avoid having to write your own callback:
+to avoid having to write your own callback::
 
 	kfree_rcu(old_fp, rcu);
 
 Again, see checklist.txt for additional rules governing the use of RCU.
 
+.. _5_whatisRCU:
 
 5.  WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
+------------------------------------------------
 
 One of the nice things about RCU is that it has extremely simple "toy"
 implementations that are a good first step towards understanding the
@@ -579,7 +603,7 @@ more details on the current implementation as of early 2004.
 
 
 5A.  "TOY" IMPLEMENTATION #1: LOCKING
-
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 This section presents a "toy" RCU implementation that is based on
 familiar locking primitives.  Its overhead makes it a non-starter for
 real-life use, as does its lack of scalability.  It is also unsuitable
@@ -591,7 +615,7 @@ you allow nested rcu_read_lock() calls, you can deadlock.
 However, it is probably the easiest implementation to relate to, so is
 a good starting point.
 
-It is extremely simple:
+It is extremely simple::
 
 	static DEFINE_RWLOCK(rcu_gp_mutex);
 
@@ -614,7 +638,7 @@ It is extremely simple:
 
 [You can ignore rcu_assign_pointer() and rcu_dereference() without missing
 much.  But here are simplified versions anyway.  And whatever you do,
-don't forget about them when submitting patches making use of RCU!]
+don't forget about them when submitting patches making use of RCU!]::
 
 	#define rcu_assign_pointer(p, v) \
 	({ \
@@ -647,18 +671,23 @@ that the only thing that can block rcu_read_lock() is a synchronize_rcu().
 But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
 so there can be no deadlock cycle.
 
-Quick Quiz #1:	Why is this argument naive?  How could a deadlock
+.. _quiz_1:
+
+Quick Quiz #1:
+		Why is this argument naive?  How could a deadlock
 		occur when using this algorithm in a real-world Linux
 		kernel?  How could this deadlock be avoided?
 
+:ref:`Answers to Quick Quiz <8_whatisRCU>`
 
 5B.  "TOY" EXAMPLE #2: CLASSIC RCU
-
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 This section presents a "toy" RCU implementation that is based on
 "classic RCU".  It is also short on performance (but only for updates) and
 on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
 kernels.  The definitions of rcu_dereference() and rcu_assign_pointer()
 are the same as those shown in the preceding section, so they are omitted.
+::
 
 	void rcu_read_lock(void) { }
 
@@ -683,14 +712,14 @@ CPU in turn.  The run_on() primitive can be implemented straightforwardly
 in terms of the sched_setaffinity() primitive.  Of course, a somewhat less
 "toy" implementation would restore the affinity upon completion rather
 than just leaving all tasks running on the last CPU, but when I said
-"toy", I meant -toy-!
+"toy", I meant **toy**!
 
 So how the heck is this supposed to work???
 
 Remember that it is illegal to block while in an RCU read-side critical
 section.  Therefore, if a given CPU executes a context switch, we know
 that it must have completed all preceding RCU read-side critical sections.
-Once -all- CPUs have executed a context switch, then -all- preceding
+Once **all** CPUs have executed a context switch, then **all** preceding
 RCU read-side critical sections will have completed.
 
 So, suppose that we remove a data item from its structure and then invoke
@@ -698,19 +727,32 @@ synchronize_rcu().  Once synchronize_rcu() returns, we are guaranteed
 that there are no RCU read-side critical sections holding a reference
 to that data item, so we can safely reclaim it.
 
-Quick Quiz #2:	Give an example where Classic RCU's read-side
-		overhead is -negative-.
+.. _quiz_2:
 
-Quick Quiz #3:  If it is illegal to block in an RCU read-side
+Quick Quiz #2:
+		Give an example where Classic RCU's read-side
+		overhead is **negative**.
+
+:ref:`Answers to Quick Quiz <8_whatisRCU>`
+
+.. _quiz_3:
+
+Quick Quiz #3:
+		If it is illegal to block in an RCU read-side
 		critical section, what the heck do you do in
 		PREEMPT_RT, where normal spinlocks can block???
 
+:ref:`Answers to Quick Quiz <8_whatisRCU>`
+
+.. _6_whatisRCU:
 
 6.  ANALOGY WITH READER-WRITER LOCKING
+--------------------------------------
 
 Although RCU can be used in many different ways, a very common use of
 RCU is analogous to reader-writer locking.  The following unified
 diff shows how closely related RCU and reader-writer locking can be.
+::
 
 	@@ -5,5 +5,5 @@ struct el {
 	 	int data;
@@ -762,7 +804,7 @@ diff shows how closely related RCU and reader-writer locking can be.
 		return 0;
 	 }
 
-Or, for those who prefer a side-by-side listing:
+Or, for those who prefer a side-by-side listing::
 
  1 struct el {                          1 struct el {
  2   struct list_head list;             2   struct list_head list;
@@ -774,40 +816,44 @@ Or, for those who prefer a side-by-side listing:
  8 rwlock_t listmutex;                  8 spinlock_t listmutex;
  9 struct el head;                      9 struct el head;
 
- 1 int search(long key, int *result)    1 int search(long key, int *result)
- 2 {                                    2 {
- 3   struct list_head *lp;              3   struct list_head *lp;
- 4   struct el *p;                      4   struct el *p;
- 5                                      5
- 6   read_lock(&listmutex);             6   rcu_read_lock();
- 7   list_for_each_entry(p, head, lp) { 7   list_for_each_entry_rcu(p, head, lp) {
- 8     if (p->key == key) {             8     if (p->key == key) {
- 9       *result = p->data;             9       *result = p->data;
-10       read_unlock(&listmutex);      10       rcu_read_unlock();
-11       return 1;                     11       return 1;
-12     }                               12     }
-13   }                                 13   }
-14   read_unlock(&listmutex);          14   rcu_read_unlock();
-15   return 0;                         15   return 0;
-16 }                                   16 }
+::
 
- 1 int delete(long key)                 1 int delete(long key)
- 2 {                                    2 {
- 3   struct el *p;                      3   struct el *p;
- 4                                      4
- 5   write_lock(&listmutex);            5   spin_lock(&listmutex);
- 6   list_for_each_entry(p, head, lp) { 6   list_for_each_entry(p, head, lp) {
- 7     if (p->key == key) {             7     if (p->key == key) {
- 8       list_del(&p->list);            8       list_del_rcu(&p->list);
- 9       write_unlock(&listmutex);      9       spin_unlock(&listmutex);
-                                       10       synchronize_rcu();
-10       kfree(p);                     11       kfree(p);
-11       return 1;                     12       return 1;
-12     }                               13     }
-13   }                                 14   }
-14   write_unlock(&listmutex);         15   spin_unlock(&listmutex);
-15   return 0;                         16   return 0;
-16 }                                   17 }
+  1 int search(long key, int *result)    1 int search(long key, int *result)
+  2 {                                    2 {
+  3   struct list_head *lp;              3   struct list_head *lp;
+  4   struct el *p;                      4   struct el *p;
+  5                                      5
+  6   read_lock(&listmutex);             6   rcu_read_lock();
+  7   list_for_each_entry(p, head, lp) { 7   list_for_each_entry_rcu(p, head, lp) {
+  8     if (p->key == key) {             8     if (p->key == key) {
+  9       *result = p->data;             9       *result = p->data;
+ 10       read_unlock(&listmutex);      10       rcu_read_unlock();
+ 11       return 1;                     11       return 1;
+ 12     }                               12     }
+ 13   }                                 13   }
+ 14   read_unlock(&listmutex);          14   rcu_read_unlock();
+ 15   return 0;                         15   return 0;
+ 16 }                                   16 }
+
+::
+
+  1 int delete(long key)                 1 int delete(long key)
+  2 {                                    2 {
+  3   struct el *p;                      3   struct el *p;
+  4                                      4
+  5   write_lock(&listmutex);            5   spin_lock(&listmutex);
+  6   list_for_each_entry(p, head, lp) { 6   list_for_each_entry(p, head, lp) {
+  7     if (p->key == key) {             7     if (p->key == key) {
+  8       list_del(&p->list);            8       list_del_rcu(&p->list);
+  9       write_unlock(&listmutex);      9       spin_unlock(&listmutex);
+                                        10       synchronize_rcu();
+ 10       kfree(p);                     11       kfree(p);
+ 11       return 1;                     12       return 1;
+ 12     }                               13     }
+ 13   }                                 14   }
+ 14   write_unlock(&listmutex);         15   spin_unlock(&listmutex);
+ 15   return 0;                         16   return 0;
+ 16 }                                   17 }
 
 Either way, the differences are quite small.  Read-side locking moves
 to rcu_read_lock() and rcu_read_unlock, update-side locking moves from
@@ -825,15 +871,17 @@ delete() can now block.  If this is a problem, there is a callback-based
 mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
 be used in place of synchronize_rcu().
 
+.. _7_whatisRCU:
 
 7.  FULL LIST OF RCU APIs
+-------------------------
 
 The RCU APIs are documented in docbook-format header comments in the
 Linux-kernel source code, but it helps to have a full list of the
 APIs, since there does not appear to be a way to categorize them
 in docbook.  Here is the list, by category.
 
-RCU list traversal:
+RCU list traversal::
 
 	list_entry_rcu
 	list_first_entry_rcu
@@ -854,7 +902,7 @@ RCU list traversal:
 	hlist_bl_first_rcu
 	hlist_bl_for_each_entry_rcu
 
-RCU pointer/list update:
+RCU pointer/list udate::
 
 	rcu_assign_pointer
 	list_add_rcu
@@ -876,7 +924,9 @@ RCU pointer/list update:
 	hlist_bl_del_rcu
 	hlist_bl_set_first_rcu
 
-RCU:	Critical sections	Grace period		Barrier
+RCU::
+
+	Critical sections	Grace period		Barrier
 
 	rcu_read_lock		synchronize_net		rcu_barrier
 	rcu_read_unlock		synchronize_rcu
@@ -885,7 +935,9 @@ RCU:	Critical sections	Grace period		Barrier
 	rcu_dereference_check	kfree_rcu
 	rcu_dereference_protected
 
-bh:	Critical sections	Grace period		Barrier
+bh::
+
+	Critical sections	Grace period		Barrier
 
 	rcu_read_lock_bh	call_rcu		rcu_barrier
 	rcu_read_unlock_bh	synchronize_rcu
@@ -896,7 +948,9 @@ bh:	Critical sections	Grace period		Barrier
 	rcu_dereference_bh_protected
 	rcu_read_lock_bh_held
 
-sched:	Critical sections	Grace period		Barrier
+sched::
+
+	Critical sections	Grace period		Barrier
 
 	rcu_read_lock_sched	call_rcu		rcu_barrier
 	rcu_read_unlock_sched	synchronize_rcu
@@ -910,7 +964,9 @@ sched:	Critical sections	Grace period		Barrier
 	rcu_read_lock_sched_held
 
 
-SRCU:	Critical sections	Grace period		Barrier
+SRCU::
+
+	Critical sections	Grace period		Barrier
 
 	srcu_read_lock		call_srcu		srcu_barrier
 	srcu_read_unlock	synchronize_srcu
@@ -918,13 +974,14 @@ SRCU:	Critical sections	Grace period		Barrier
 	srcu_dereference_check
 	srcu_read_lock_held
 
-SRCU:	Initialization/cleanup
+SRCU: Initialization/cleanup::
+
 	DEFINE_SRCU
 	DEFINE_STATIC_SRCU
 	init_srcu_struct
 	cleanup_srcu_struct
 
-All:  lockdep-checked RCU-protected pointer access
+All: lockdep-checked RCU-protected pointer access::
 
 	rcu_access_pointer
 	rcu_dereference_raw
@@ -974,15 +1031,19 @@ g.	Otherwise, use RCU.
 Of course, this all assumes that you have determined that RCU is in fact
 the right tool for your job.
 
+.. _8_whatisRCU:
 
 8.  ANSWERS TO QUICK QUIZZES
+----------------------------
 
-Quick Quiz #1:	Why is this argument naive?  How could a deadlock
+Quick Quiz #1:
+		Why is this argument naive?  How could a deadlock
 		occur when using this algorithm in a real-world Linux
 		kernel?  [Referring to the lock-based "toy" RCU
 		algorithm.]
 
-Answer:		Consider the following sequence of events:
+Answer:
+		Consider the following sequence of events:
 
 		1.	CPU 0 acquires some unrelated lock, call it
 			"problematic_lock", disabling irq via
@@ -1021,10 +1082,14 @@ Answer:		Consider the following sequence of events:
 		approach where tasks in RCU read-side critical sections
 		cannot be blocked by tasks executing synchronize_rcu().
 
-Quick Quiz #2:	Give an example where Classic RCU's read-side
-		overhead is -negative-.
+:ref:`Back to Quick Quiz #1 <quiz_1>`
 
-Answer:		Imagine a single-CPU system with a non-CONFIG_PREEMPT
+Quick Quiz #2:
+		Give an example where Classic RCU's read-side
+		overhead is **negative**.
+
+Answer:
+		Imagine a single-CPU system with a non-CONFIG_PREEMPT
 		kernel where a routing table is used by process-context
 		code, but can be updated by irq-context code (for example,
 		by an "ICMP REDIRECT" packet).	The usual way of handling
@@ -1046,11 +1111,15 @@ Answer:		Imagine a single-CPU system with a non-CONFIG_PREEMPT
 		even the theoretical possibility of negative overhead for
 		a synchronization primitive is a bit unexpected.  ;-)
 
-Quick Quiz #3:  If it is illegal to block in an RCU read-side
+:ref:`Back to Quick Quiz #2 <quiz_2>`
+
+Quick Quiz #3:
+		If it is illegal to block in an RCU read-side
 		critical section, what the heck do you do in
 		PREEMPT_RT, where normal spinlocks can block???
 
-Answer:		Just as PREEMPT_RT permits preemption of spinlock
+Answer:
+		Just as PREEMPT_RT permits preemption of spinlock
 		critical sections, it permits preemption of RCU
 		read-side critical sections.  It also permits
 		spinlocks blocking while in RCU read-side critical
@@ -1069,6 +1138,7 @@ Answer:		Just as PREEMPT_RT permits preemption of spinlock
 		Besides, how does the computer know what pizza parlor
 		the human being went to???
 
+:ref:`Back to Quick Quiz #3 <quiz_3>`
 
 ACKNOWLEDGEMENTS
 

From b00aedf978aa5c9a3c2d734fda5e51acfbceb5d6 Mon Sep 17 00:00:00 2001
From: Amol Grover <frextrite@gmail.com>
Date: Sat, 2 Nov 2019 13:31:07 +0530
Subject: [PATCH 33/69] doc: Convert to rcu_dereference.txt to
 rcu_dereference.rst

This patch converts rcu_dereference.txt to rcu_dereference.rst and
adds it to index.rst

Signed-off-by: Amol Grover <frextrite@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst                   |  1 +
 ...cu_dereference.txt => rcu_dereference.rst} | 75 ++++++++++---------
 2 files changed, 42 insertions(+), 34 deletions(-)
 rename Documentation/RCU/{rcu_dereference.txt => rcu_dereference.rst} (88%)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index b9b11481c727..c81d0e4fd999 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -8,6 +8,7 @@ RCU concepts
    :maxdepth: 3
 
    arrayRCU
+   rcu_dereference
    whatisRCU
    rcu
    listRCU
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.rst
similarity index 88%
rename from Documentation/RCU/rcu_dereference.txt
rename to Documentation/RCU/rcu_dereference.rst
index bf699e8cfc75..c9667eb0d444 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.rst
@@ -1,4 +1,7 @@
+.. _rcu_dereference_doc:
+
 PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
+===============================================================
 
 Most of the time, you can use values from rcu_dereference() or one of
 the similar primitives without worries.  Dereferencing (prefix "*"),
@@ -8,7 +11,7 @@ subtraction of constants, and casts all work quite naturally and safely.
 It is nevertheless possible to get into trouble with other operations.
 Follow these rules to keep your RCU code working properly:
 
-o	You must use one of the rcu_dereference() family of primitives
+-	You must use one of the rcu_dereference() family of primitives
 	to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
 	will complain.  Worse yet, your code can see random memory-corruption
 	bugs due to games that compilers and DEC Alpha can play.
@@ -25,24 +28,24 @@ o	You must use one of the rcu_dereference() family of primitives
 	for an example where the compiler can in fact deduce the exact
 	value of the pointer, and thus cause misordering.
 
-o	You are only permitted to use rcu_dereference on pointer values.
+-	You are only permitted to use rcu_dereference on pointer values.
 	The compiler simply knows too much about integral values to
 	trust it to carry dependencies through integer operations.
 	There are a very few exceptions, namely that you can temporarily
 	cast the pointer to uintptr_t in order to:
 
-	o	Set bits and clear bits down in the must-be-zero low-order
+	-	Set bits and clear bits down in the must-be-zero low-order
 		bits of that pointer.  This clearly means that the pointer
 		must have alignment constraints, for example, this does
 		-not- work in general for char* pointers.
 
-	o	XOR bits to translate pointers, as is done in some
+	-	XOR bits to translate pointers, as is done in some
 		classic buddy-allocator algorithms.
 
 	It is important to cast the value back to pointer before
 	doing much of anything else with it.
 
-o	Avoid cancellation when using the "+" and "-" infix arithmetic
+-	Avoid cancellation when using the "+" and "-" infix arithmetic
 	operators.  For example, for a given variable "x", avoid
 	"(x-(uintptr_t)x)" for char* pointers.	The compiler is within its
 	rights to substitute zero for this sort of expression, so that
@@ -54,16 +57,16 @@ o	Avoid cancellation when using the "+" and "-" infix arithmetic
 	"p+a-b" is safe because its value still necessarily depends on
 	the rcu_dereference(), thus maintaining proper ordering.
 
-o	If you are using RCU to protect JITed functions, so that the
+-	If you are using RCU to protect JITed functions, so that the
 	"()" function-invocation operator is applied to a value obtained
 	(directly or indirectly) from rcu_dereference(), you may need to
 	interact directly with the hardware to flush instruction caches.
 	This issue arises on some systems when a newly JITed function is
 	using the same memory that was used by an earlier JITed function.
 
-o	Do not use the results from relational operators ("==", "!=",
+-	Do not use the results from relational operators ("==", "!=",
 	">", ">=", "<", or "<=") when dereferencing.  For example,
-	the following (quite strange) code is buggy:
+	the following (quite strange) code is buggy::
 
 		int *p;
 		int *q;
@@ -81,11 +84,11 @@ o	Do not use the results from relational operators ("==", "!=",
 	after such branches, but can speculate loads, which can again
 	result in misordering bugs.
 
-o	Be very careful about comparing pointers obtained from
+-	Be very careful about comparing pointers obtained from
 	rcu_dereference() against non-NULL values.  As Linus Torvalds
 	explained, if the two pointers are equal, the compiler could
 	substitute the pointer you are comparing against for the pointer
-	obtained from rcu_dereference().  For example:
+	obtained from rcu_dereference().  For example::
 
 		p = rcu_dereference(gp);
 		if (p == &default_struct)
@@ -93,7 +96,7 @@ o	Be very careful about comparing pointers obtained from
 
 	Because the compiler now knows that the value of "p" is exactly
 	the address of the variable "default_struct", it is free to
-	transform this code into the following:
+	transform this code into the following::
 
 		p = rcu_dereference(gp);
 		if (p == &default_struct)
@@ -105,14 +108,14 @@ o	Be very careful about comparing pointers obtained from
 
 	However, comparisons are OK in the following cases:
 
-	o	The comparison was against the NULL pointer.  If the
+	-	The comparison was against the NULL pointer.  If the
 		compiler knows that the pointer is NULL, you had better
 		not be dereferencing it anyway.  If the comparison is
 		non-equal, the compiler is none the wiser.  Therefore,
 		it is safe to compare pointers from rcu_dereference()
 		against NULL pointers.
 
-	o	The pointer is never dereferenced after being compared.
+	-	The pointer is never dereferenced after being compared.
 		Since there are no subsequent dereferences, the compiler
 		cannot use anything it learned from the comparison
 		to reorder the non-existent subsequent dereferences.
@@ -124,31 +127,31 @@ o	Be very careful about comparing pointers obtained from
 		dereferenced, rcu_access_pointer() should be used in place
 		of rcu_dereference().
 
-	o	The comparison is against a pointer that references memory
+	-	The comparison is against a pointer that references memory
 		that was initialized "a long time ago."  The reason
 		this is safe is that even if misordering occurs, the
 		misordering will not affect the accesses that follow
 		the comparison.  So exactly how long ago is "a long
 		time ago"?  Here are some possibilities:
 
-		o	Compile time.
+		-	Compile time.
 
-		o	Boot time.
+		-	Boot time.
 
-		o	Module-init time for module code.
+		-	Module-init time for module code.
 
-		o	Prior to kthread creation for kthread code.
+		-	Prior to kthread creation for kthread code.
 
-		o	During some prior acquisition of the lock that
+		-	During some prior acquisition of the lock that
 			we now hold.
 
-		o	Before mod_timer() time for a timer handler.
+		-	Before mod_timer() time for a timer handler.
 
 		There are many other possibilities involving the Linux
 		kernel's wide array of primitives that cause code to
 		be invoked at a later time.
 
-	o	The pointer being compared against also came from
+	-	The pointer being compared against also came from
 		rcu_dereference().  In this case, both pointers depend
 		on one rcu_dereference() or another, so you get proper
 		ordering either way.
@@ -159,13 +162,13 @@ o	Be very careful about comparing pointers obtained from
 		of such an RCU usage bug is shown in the section titled
 		"EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
 
-	o	All of the accesses following the comparison are stores,
+	-	All of the accesses following the comparison are stores,
 		so that a control dependency preserves the needed ordering.
 		That said, it is easy to get control dependencies wrong.
 		Please see the "CONTROL DEPENDENCIES" section of
 		Documentation/memory-barriers.txt for more details.
 
-	o	The pointers are not equal -and- the compiler does
+	-	The pointers are not equal -and- the compiler does
 		not have enough information to deduce the value of the
 		pointer.  Note that the volatile cast in rcu_dereference()
 		will normally prevent the compiler from knowing too much.
@@ -175,7 +178,7 @@ o	Be very careful about comparing pointers obtained from
 		comparison will provide exactly the information that the
 		compiler needs to deduce the value of the pointer.
 
-o	Disable any value-speculation optimizations that your compiler
+-	Disable any value-speculation optimizations that your compiler
 	might provide, especially if you are making use of feedback-based
 	optimizations that take data collected from prior runs.  Such
 	value-speculation optimizations reorder operations by design.
@@ -188,11 +191,12 @@ o	Disable any value-speculation optimizations that your compiler
 
 
 EXAMPLE OF AMPLIFIED RCU-USAGE BUG
+----------------------------------
 
 Because updaters can run concurrently with RCU readers, RCU readers can
 see stale and/or inconsistent values.  If RCU readers need fresh or
 consistent values, which they sometimes do, they need to take proper
-precautions.  To see this, consider the following code fragment:
+precautions.  To see this, consider the following code fragment::
 
 	struct foo {
 		int a;
@@ -244,7 +248,7 @@ to some reordering from the compiler and CPUs is beside the point.
 
 But suppose that the reader needs a consistent view?
 
-Then one approach is to use locking, for example, as follows:
+Then one approach is to use locking, for example, as follows::
 
 	struct foo {
 		int a;
@@ -299,6 +303,7 @@ As always, use the right tool for the job!
 
 
 EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
+-----------------------------------------
 
 If a pointer obtained from rcu_dereference() compares not-equal to some
 other pointer, the compiler normally has no clue what the value of the
@@ -308,7 +313,7 @@ guarantees that RCU depends on.  And the volatile cast in rcu_dereference()
 should prevent the compiler from guessing the value.
 
 But without rcu_dereference(), the compiler knows more than you might
-expect.  Consider the following code fragment:
+expect.  Consider the following code fragment::
 
 	struct foo {
 		int a;
@@ -354,6 +359,7 @@ dereference the resulting pointer.
 
 
 WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
+------------------------------------------------------------
 
 First, please avoid using rcu_dereference_raw() and also please avoid
 using rcu_dereference_check() and rcu_dereference_protected() with a
@@ -370,7 +376,7 @@ member of the rcu_dereference() to use in various situations:
 
 2.	If the access might be within an RCU read-side critical section
 	on the one hand, or protected by (say) my_lock on the other,
-	use rcu_dereference_check(), for example:
+	use rcu_dereference_check(), for example::
 
 		p1 = rcu_dereference_check(p->rcu_protected_pointer,
 					   lockdep_is_held(&my_lock));
@@ -378,14 +384,14 @@ member of the rcu_dereference() to use in various situations:
 
 3.	If the access might be within an RCU read-side critical section
 	on the one hand, or protected by either my_lock or your_lock on
-	the other, again use rcu_dereference_check(), for example:
+	the other, again use rcu_dereference_check(), for example::
 
 		p1 = rcu_dereference_check(p->rcu_protected_pointer,
 					   lockdep_is_held(&my_lock) ||
 					   lockdep_is_held(&your_lock));
 
 4.	If the access is on the update side, so that it is always protected
-	by my_lock, use rcu_dereference_protected():
+	by my_lock, use rcu_dereference_protected()::
 
 		p1 = rcu_dereference_protected(p->rcu_protected_pointer,
 					       lockdep_is_held(&my_lock));
@@ -410,18 +416,19 @@ member of the rcu_dereference() to use in various situations:
 
 
 SPARSE CHECKING OF RCU-PROTECTED POINTERS
+-----------------------------------------
 
 The sparse static-analysis tool checks for direct access to RCU-protected
 pointers, which can result in "interesting" bugs due to compiler
 optimizations involving invented loads and perhaps also load tearing.
-For example, suppose someone mistakenly does something like this:
+For example, suppose someone mistakenly does something like this::
 
 	p = q->rcu_protected_pointer;
 	do_something_with(p->a);
 	do_something_else_with(p->b);
 
 If register pressure is high, the compiler might optimize "p" out
-of existence, transforming the code to something like this:
+of existence, transforming the code to something like this::
 
 	do_something_with(q->rcu_protected_pointer->a);
 	do_something_else_with(q->rcu_protected_pointer->b);
@@ -435,7 +442,7 @@ Load tearing could of course result in dereferencing a mashup of a pair
 of pointers, which also might fatally disappoint your code.
 
 These problems could have been avoided simply by making the code instead
-read as follows:
+read as follows::
 
 	p = rcu_dereference(q->rcu_protected_pointer);
 	do_something_with(p->a);
@@ -448,7 +455,7 @@ or as a formal parameter, with "__rcu", which tells sparse to complain if
 this pointer is accessed directly.  It will also cause sparse to complain
 if a pointer not marked with "__rcu" is accessed using rcu_dereference()
 and friends.  For example, ->rcu_protected_pointer might be declared as
-follows:
+follows::
 
 	struct foo __rcu *rcu_protected_pointer;
 

From 4af498306ffd8e29ed5c1ae544d01bc8c09c3f8e Mon Sep 17 00:00:00 2001
From: Amol Grover <frextrite@gmail.com>
Date: Thu, 7 Nov 2019 12:02:41 +0530
Subject: [PATCH 34/69] doc: Convert to rcubarrier.txt to ReST

Convert rcubarrier.txt to rcubarrier.rst and add it to index.rst.

Format file according to reST
- Add headings and sub-headings
- Add code segments
- Add cross-references to quizes and answers

Signed-off-by: Amol Grover <frextrite@gmail.com>
Tested-by: Phong Tran <tranmanphong@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst                   |   1 +
 .../RCU/{rcubarrier.txt => rcubarrier.rst}    | 222 ++++++++++--------
 2 files changed, 126 insertions(+), 97 deletions(-)
 rename Documentation/RCU/{rcubarrier.txt => rcubarrier.rst} (72%)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index c81d0e4fd999..81a0a1e5f767 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -8,6 +8,7 @@ RCU concepts
    :maxdepth: 3
 
    arrayRCU
+   rcubarrier
    rcu_dereference
    whatisRCU
    rcu
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.rst
similarity index 72%
rename from Documentation/RCU/rcubarrier.txt
rename to Documentation/RCU/rcubarrier.rst
index a2782df69732..f64f4413a47c 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.rst
@@ -1,4 +1,7 @@
+.. _rcu_barrier:
+
 RCU and Unloadable Modules
+==========================
 
 [Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
 
@@ -21,7 +24,7 @@ given that readers might well leave absolutely no trace of their
 presence? There is a synchronize_rcu() primitive that blocks until all
 pre-existing readers have completed. An updater wishing to delete an
 element p from a linked list might do the following, while holding an
-appropriate lock, of course:
+appropriate lock, of course::
 
 	list_del_rcu(p);
 	synchronize_rcu();
@@ -32,13 +35,13 @@ primitive must be used instead. This primitive takes a pointer to an
 rcu_head struct placed within the RCU-protected data structure and
 another pointer to a function that may be invoked later to free that
 structure. Code to delete an element p from the linked list from IRQ
-context might then be as follows:
+context might then be as follows::
 
 	list_del_rcu(p);
 	call_rcu(&p->rcu, p_callback);
 
 Since call_rcu() never blocks, this code can safely be used from within
-IRQ context. The function p_callback() might be defined as follows:
+IRQ context. The function p_callback() might be defined as follows::
 
 	static void p_callback(struct rcu_head *rp)
 	{
@@ -49,6 +52,7 @@ IRQ context. The function p_callback() might be defined as follows:
 
 
 Unloading Modules That Use call_rcu()
+-------------------------------------
 
 But what if p_callback is defined in an unloadable module?
 
@@ -69,10 +73,11 @@ in realtime kernels in order to avoid excessive scheduling latencies.
 
 
 rcu_barrier()
+-------------
 
 We instead need the rcu_barrier() primitive.  Rather than waiting for
 a grace period to elapse, rcu_barrier() waits for all outstanding RCU
-callbacks to complete.  Please note that rcu_barrier() does -not- imply
+callbacks to complete.  Please note that rcu_barrier() does **not** imply
 synchronize_rcu(), in particular, if there are no RCU callbacks queued
 anywhere, rcu_barrier() is within its rights to return immediately,
 without waiting for a grace period to elapse.
@@ -88,79 +93,79 @@ must match the flavor of rcu_barrier() with that of call_rcu().  If your
 module uses multiple flavors of call_rcu(), then it must also use multiple
 flavors of rcu_barrier() when unloading that module.  For example, if
 it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
-srcu_struct_2(), then the following three lines of code will be required
-when unloading:
+srcu_struct_2, then the following three lines of code will be required
+when unloading::
 
  1 rcu_barrier();
  2 srcu_barrier(&srcu_struct_1);
  3 srcu_barrier(&srcu_struct_2);
 
 The rcutorture module makes use of rcu_barrier() in its exit function
-as follows:
+as follows::
 
- 1 static void
- 2 rcu_torture_cleanup(void)
- 3 {
- 4   int i;
+ 1  static void
+ 2  rcu_torture_cleanup(void)
+ 3  {
+ 4    int i;
  5
- 6   fullstop = 1;
- 7   if (shuffler_task != NULL) {
+ 6    fullstop = 1;
+ 7    if (shuffler_task != NULL) {
  8     VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
  9     kthread_stop(shuffler_task);
-10   }
-11   shuffler_task = NULL;
-12
-13   if (writer_task != NULL) {
-14     VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
-15     kthread_stop(writer_task);
-16   }
-17   writer_task = NULL;
-18
-19   if (reader_tasks != NULL) {
-20     for (i = 0; i < nrealreaders; i++) {
-21       if (reader_tasks[i] != NULL) {
-22         VERBOSE_PRINTK_STRING(
-23           "Stopping rcu_torture_reader task");
-24         kthread_stop(reader_tasks[i]);
-25       }
-26       reader_tasks[i] = NULL;
-27     }
-28     kfree(reader_tasks);
-29     reader_tasks = NULL;
-30   }
-31   rcu_torture_current = NULL;
-32
-33   if (fakewriter_tasks != NULL) {
-34     for (i = 0; i < nfakewriters; i++) {
-35       if (fakewriter_tasks[i] != NULL) {
-36         VERBOSE_PRINTK_STRING(
-37           "Stopping rcu_torture_fakewriter task");
-38         kthread_stop(fakewriter_tasks[i]);
-39       }
-40       fakewriter_tasks[i] = NULL;
-41     }
-42     kfree(fakewriter_tasks);
-43     fakewriter_tasks = NULL;
-44   }
-45
-46   if (stats_task != NULL) {
-47     VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
-48     kthread_stop(stats_task);
-49   }
-50   stats_task = NULL;
-51
-52   /* Wait for all RCU callbacks to fire. */
-53   rcu_barrier();
-54
-55   rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
-56
-57   if (cur_ops->cleanup != NULL)
-58     cur_ops->cleanup();
-59   if (atomic_read(&n_rcu_torture_error))
-60     rcu_torture_print_module_parms("End of test: FAILURE");
-61   else
-62     rcu_torture_print_module_parms("End of test: SUCCESS");
-63 }
+ 10   }
+ 11   shuffler_task = NULL;
+ 12
+ 13   if (writer_task != NULL) {
+ 14     VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
+ 15     kthread_stop(writer_task);
+ 16   }
+ 17   writer_task = NULL;
+ 18
+ 19   if (reader_tasks != NULL) {
+ 20     for (i = 0; i < nrealreaders; i++) {
+ 21       if (reader_tasks[i] != NULL) {
+ 22         VERBOSE_PRINTK_STRING(
+ 23           "Stopping rcu_torture_reader task");
+ 24         kthread_stop(reader_tasks[i]);
+ 25       }
+ 26       reader_tasks[i] = NULL;
+ 27     }
+ 28     kfree(reader_tasks);
+ 29     reader_tasks = NULL;
+ 30   }
+ 31   rcu_torture_current = NULL;
+ 32
+ 33   if (fakewriter_tasks != NULL) {
+ 34     for (i = 0; i < nfakewriters; i++) {
+ 35       if (fakewriter_tasks[i] != NULL) {
+ 36         VERBOSE_PRINTK_STRING(
+ 37           "Stopping rcu_torture_fakewriter task");
+ 38         kthread_stop(fakewriter_tasks[i]);
+ 39       }
+ 40       fakewriter_tasks[i] = NULL;
+ 41     }
+ 42     kfree(fakewriter_tasks);
+ 43     fakewriter_tasks = NULL;
+ 44   }
+ 45
+ 46   if (stats_task != NULL) {
+ 47     VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
+ 48     kthread_stop(stats_task);
+ 49   }
+ 50   stats_task = NULL;
+ 51
+ 52   /* Wait for all RCU callbacks to fire. */
+ 53   rcu_barrier();
+ 54
+ 55   rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+ 56
+ 57   if (cur_ops->cleanup != NULL)
+ 58     cur_ops->cleanup();
+ 59   if (atomic_read(&n_rcu_torture_error))
+ 60     rcu_torture_print_module_parms("End of test: FAILURE");
+ 61   else
+ 62     rcu_torture_print_module_parms("End of test: SUCCESS");
+ 63 }
 
 Line 6 sets a global variable that prevents any RCU callbacks from
 re-posting themselves. This will not be necessary in most cases, since
@@ -176,9 +181,14 @@ for any pre-existing callbacks to complete.
 Then lines 55-62 print status and do operation-specific cleanup, and
 then return, permitting the module-unload operation to be completed.
 
-Quick Quiz #1: Is there any other situation where rcu_barrier() might
+.. _rcubarrier_quiz_1:
+
+Quick Quiz #1:
+	Is there any other situation where rcu_barrier() might
 	be required?
 
+:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>`
+
 Your module might have additional complications. For example, if your
 module invokes call_rcu() from timers, you will need to first cancel all
 the timers, and only then invoke rcu_barrier() to wait for any remaining
@@ -188,11 +198,12 @@ Of course, if you module uses call_rcu(), you will need to invoke
 rcu_barrier() before unloading.  Similarly, if your module uses
 call_srcu(), you will need to invoke srcu_barrier() before unloading,
 and on the same srcu_struct structure.  If your module uses call_rcu()
--and- call_srcu(), then you will need to invoke rcu_barrier() -and-
+**and** call_srcu(), then you will need to invoke rcu_barrier() **and**
 srcu_barrier().
 
 
 Implementing rcu_barrier()
+--------------------------
 
 Dipankar Sarma's implementation of rcu_barrier() makes use of the fact
 that RCU callbacks are never reordered once queued on one of the per-CPU
@@ -200,19 +211,19 @@ queues. His implementation queues an RCU callback on each of the per-CPU
 callback queues, and then waits until they have all started executing, at
 which point, all earlier RCU callbacks are guaranteed to have completed.
 
-The original code for rcu_barrier() was as follows:
+The original code for rcu_barrier() was as follows::
 
- 1 void rcu_barrier(void)
- 2 {
- 3   BUG_ON(in_interrupt());
- 4   /* Take cpucontrol mutex to protect against CPU hotplug */
- 5   mutex_lock(&rcu_barrier_mutex);
- 6   init_completion(&rcu_barrier_completion);
- 7   atomic_set(&rcu_barrier_cpu_count, 0);
- 8   on_each_cpu(rcu_barrier_func, NULL, 0, 1);
- 9   wait_for_completion(&rcu_barrier_completion);
-10   mutex_unlock(&rcu_barrier_mutex);
-11 }
+ 1  void rcu_barrier(void)
+ 2  {
+ 3    BUG_ON(in_interrupt());
+ 4    /* Take cpucontrol mutex to protect against CPU hotplug */
+ 5    mutex_lock(&rcu_barrier_mutex);
+ 6    init_completion(&rcu_barrier_completion);
+ 7    atomic_set(&rcu_barrier_cpu_count, 0);
+ 8    on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+ 9    wait_for_completion(&rcu_barrier_completion);
+ 10   mutex_unlock(&rcu_barrier_mutex);
+ 11 }
 
 Line 3 verifies that the caller is in process context, and lines 5 and 10
 use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
@@ -226,18 +237,18 @@ This code was rewritten in 2008 and several times thereafter, but this
 still gives the general idea.
 
 The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
-to post an RCU callback, as follows:
+to post an RCU callback, as follows::
 
- 1 static void rcu_barrier_func(void *notused)
- 2 {
- 3 int cpu = smp_processor_id();
- 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- 5 struct rcu_head *head;
+ 1  static void rcu_barrier_func(void *notused)
+ 2  {
+ 3    int cpu = smp_processor_id();
+ 4    struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ 5    struct rcu_head *head;
  6
- 7 head = &rdp->barrier;
- 8 atomic_inc(&rcu_barrier_cpu_count);
- 9 call_rcu(head, rcu_barrier_callback);
-10 }
+ 7    head = &rdp->barrier;
+ 8    atomic_inc(&rcu_barrier_cpu_count);
+ 9    call_rcu(head, rcu_barrier_callback);
+ 10 }
 
 Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
 which contains the struct rcu_head that needed for the later call to
@@ -248,20 +259,25 @@ the current CPU's queue.
 
 The rcu_barrier_callback() function simply atomically decrements the
 rcu_barrier_cpu_count variable and finalizes the completion when it
-reaches zero, as follows:
+reaches zero, as follows::
 
  1 static void rcu_barrier_callback(struct rcu_head *notused)
  2 {
- 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- 4 complete(&rcu_barrier_completion);
+ 3   if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ 4     complete(&rcu_barrier_completion);
  5 }
 
-Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
+.. _rcubarrier_quiz_2:
+
+Quick Quiz #2:
+	What happens if CPU 0's rcu_barrier_func() executes
 	immediately (thus incrementing rcu_barrier_cpu_count to the
 	value one), but the other CPU's rcu_barrier_func() invocations
 	are delayed for a full grace period? Couldn't this result in
 	rcu_barrier() returning prematurely?
 
+:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
+
 The current rcu_barrier() implementation is more complex, due to the need
 to avoid disturbing idle CPUs (especially on battery-powered systems)
 and the need to minimally disturb non-idle CPUs in real-time systems.
@@ -269,6 +285,7 @@ However, the code above illustrates the concepts.
 
 
 rcu_barrier() Summary
+---------------------
 
 The rcu_barrier() primitive has seen relatively little use, since most
 code using RCU is in the core kernel rather than in modules. However, if
@@ -277,8 +294,12 @@ so that your module may be safely unloaded.
 
 
 Answers to Quick Quizzes
+------------------------
 
-Quick Quiz #1: Is there any other situation where rcu_barrier() might
+.. _answer_rcubarrier_quiz_1:
+
+Quick Quiz #1:
+	Is there any other situation where rcu_barrier() might
 	be required?
 
 Answer: Interestingly enough, rcu_barrier() was not originally
@@ -292,7 +313,12 @@ Answer: Interestingly enough, rcu_barrier() was not originally
 	implementing rcutorture, and found that rcu_barrier() solves
 	this problem as well.
 
-Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
+:ref:`Back to Quick Quiz #1 <rcubarrier_quiz_1>`
+
+.. _answer_rcubarrier_quiz_2:
+
+Quick Quiz #2:
+	What happens if CPU 0's rcu_barrier_func() executes
 	immediately (thus incrementing rcu_barrier_cpu_count to the
 	value one), but the other CPU's rcu_barrier_func() invocations
 	are delayed for a full grace period? Couldn't this result in
@@ -323,3 +349,5 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last
 	is to add an rcu_read_lock() before line 8 of rcu_barrier()
 	and an rcu_read_unlock() after line 8 of this same function. If
 	you can think of a better change, please let me know!
+
+:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`

From 17f0da13873ba393a72f14d41ffc8ff388e38723 Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Date: Mon, 11 Nov 2019 23:41:22 +0530
Subject: [PATCH 35/69] doc: Updated full list of RCU API in whatisRCU.rst

This patch updates the list of RCU API in whatisRCU.rst.

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Tested-by: Amol Grover <frextrite@gmail.com>
Tested-by: Phong Tran <tranmanphong@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/whatisRCU.rst | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 2f6f6ebbc8b0..c7f147b8034f 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -884,11 +884,14 @@ in docbook.  Here is the list, by category.
 RCU list traversal::
 
 	list_entry_rcu
+	list_entry_lockless
 	list_first_entry_rcu
 	list_next_rcu
 	list_for_each_entry_rcu
 	list_for_each_entry_continue_rcu
 	list_for_each_entry_from_rcu
+	list_first_or_null_rcu
+	list_next_or_null_rcu
 	hlist_first_rcu
 	hlist_next_rcu
 	hlist_pprev_rcu
@@ -902,7 +905,7 @@ RCU list traversal::
 	hlist_bl_first_rcu
 	hlist_bl_for_each_entry_rcu
 
-RCU pointer/list udate::
+RCU pointer/list update::
 
 	rcu_assign_pointer
 	list_add_rcu
@@ -912,10 +915,12 @@ RCU pointer/list udate::
 	hlist_add_behind_rcu
 	hlist_add_before_rcu
 	hlist_add_head_rcu
+	hlist_add_tail_rcu
 	hlist_del_rcu
 	hlist_del_init_rcu
 	hlist_replace_rcu
-	list_splice_init_rcu()
+	list_splice_init_rcu
+	list_splice_tail_init_rcu
 	hlist_nulls_del_init_rcu
 	hlist_nulls_del_rcu
 	hlist_nulls_add_head_rcu

From 6e6eca2ee79a23329093cdf8d1cc0bd86128981f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 13 Nov 2019 09:12:59 -0800
Subject: [PATCH 36/69] doc: Fix typo "deference" to "dereference"

Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/lockdep-splat.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.txt
index 9c015976b174..b8096316fd11 100644
--- a/Documentation/RCU/lockdep-splat.txt
+++ b/Documentation/RCU/lockdep-splat.txt
@@ -99,7 +99,7 @@ With this change, the rcu_dereference() is always within an RCU
 read-side critical section, which again would have suppressed the
 above lockdep-RCU splat.
 
-But in this particular case, we don't actually deference the pointer
+But in this particular case, we don't actually dereference the pointer
 returned from rcu_dereference().  Instead, that pointer is just compared
 to the cic pointer, which means that the rcu_dereference() can be replaced
 by rcu_access_pointer() as follows:

From 1a271ebbfe33a44f61e02d35a2950ab00b32850b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 9 Dec 2019 19:13:45 -0800
Subject: [PATCH 37/69] net/tipc: Replace rcu_swap_protected() with
 rcu_replace_pointer()

This commit replaces the use of rcu_swap_protected() with the more
intuitively appealing rcu_replace_pointer() as a step towards removing
rcu_swap_protected().

Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
[ paulmck: Updated based on Ying Xue and Tuong Lien Tong feedback. ]
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: <netdev@vger.kernel.org>
Cc: <tipc-discussion@lists.sourceforge.net>
---
 net/tipc/crypto.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index 990a872cec46..c8c47fc72653 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -257,9 +257,6 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
 #define tipc_aead_rcu_ptr(rcu_ptr, lock)				\
 	rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock))
 
-#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock)				\
-	rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock))
-
 #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock)			\
 do {									\
 	typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr),	\
@@ -1189,7 +1186,7 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending)
 
 	/* Move passive key if any */
 	if (key.passive) {
-		tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock);
+		tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock));
 		x = (key.passive - key.pending + new_pending) % KEY_MAX;
 		new_passive = (x <= 0) ? x + KEY_MAX : x;
 	}

From a191c9e9f73a78e8801b5eeb3d43bbd6fd73b86f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 11 Dec 2019 10:30:21 -0800
Subject: [PATCH 38/69] wireless/mediatek: Replace rcu_swap_protected() with
 rcu_replace_pointer()

This commit replaces the use of rcu_swap_protected() with the more
intuitively appealing rcu_replace_pointer() as a step towards removing
rcu_swap_protected().

Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: "Martin K. Petersen" <martin.petersen@oracle.com>
[ paulmck: Apply Matthias Brugger feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Acked-by: Kalle Valo <kvalo@codeaurora.org>
Cc: Felix Fietkau <nbd@nbd.name>
Cc: Lorenzo Bianconi <lorenzo.bianconi83@gmail.com>
Cc: Ryder Lee <ryder.lee@mediatek.com>
Cc: Roy Luo <royluo@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: <linux-wireless@vger.kernel.org>
Cc: <netdev@vger.kernel.org>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: <linux-mediatek@lists.infradead.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index 53b5a4b2dcc5..59c187898132 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -281,8 +281,8 @@ void mt76_rx_aggr_stop(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno)
 {
 	struct mt76_rx_tid *tid = NULL;
 
-	rcu_swap_protected(wcid->aggr[tidno], tid,
-			   lockdep_is_held(&dev->mutex));
+	tid = rcu_replace_pointer(wcid->aggr[tidno], tid,
+				  lockdep_is_held(&dev->mutex));
 	if (tid) {
 		mt76_rx_aggr_shutdown(dev, tid);
 		kfree_rcu(tid, rcu_head);

From 4414abf89158d734a83c99f6504f648417bd9550 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 23 Sep 2019 16:31:42 -0700
Subject: [PATCH 39/69] rcu: Remove rcu_swap_protected()

Now that the calls to rcu_swap_protected() have been replaced by
rcu_replace_pointer(), this commit removes rcu_swap_protected().

Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4gg6Hw@mail.gmail.com/
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Shane M Seymour <shane.seymour@hpe.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/rcupdate.h | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 0b7506330c87..fe470243acdd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -400,22 +400,6 @@ do {									      \
 	__tmp;								\
 })
 
-/**
- * rcu_swap_protected() - swap an RCU and a regular pointer
- * @rcu_ptr: RCU pointer
- * @ptr: regular pointer
- * @c: the conditions under which the dereference will take place
- *
- * Perform swap(@rcu_ptr, @ptr) where @rcu_ptr is an RCU-annotated pointer and
- * @c is the argument that is passed to the rcu_dereference_protected() call
- * used to read that pointer.
- */
-#define rcu_swap_protected(rcu_ptr, ptr, c) do {			\
-	typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c));	\
-	rcu_assign_pointer((rcu_ptr), (ptr));				\
-	(ptr) = __tmp;							\
-} while (0)
-
 /**
  * rcu_access_pointer() - fetch RCU pointer with no dereferencing
  * @p: The pointer to read

From c30fe541896440667bd9e9068aedd1d440fbbcd2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 11 Oct 2019 21:40:09 -0700
Subject: [PATCH 40/69] rcu: Mark non-global functions and variables as static

Each of rcu_state, rcu_rnp_online_cpus(), rcu_dynticks_curr_cpu_in_eqs(),
and rcu_dynticks_snap() are used only in the kernel/rcu/tree.o translation
unit, and may thus be marked static.  This commit therefore makes this
change.

Reported-by: Ben Dooks <ben.dooks@codethink.co.uk>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 kernel/rcu/tree.c | 8 ++++----
 kernel/rcu/tree.h | 2 --
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1694a6b57ad8..dd8cfc34f4da 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
 	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
 	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
 };
-struct rcu_state rcu_state = {
+static struct rcu_state rcu_state = {
 	.level = { &rcu_state.node[0] },
 	.gp_state = RCU_GP_IDLE,
 	.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
@@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
  * held, but the bit corresponding to the current CPU will be stable
  * in most contexts.
  */
-unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
 {
 	return READ_ONCE(rnp->qsmaskinitnext);
 }
@@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void)
  *
  * No ordering, as we are sampling CPU-local information.
  */
-bool rcu_dynticks_curr_cpu_in_eqs(void)
+static bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
@@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
  * Snapshot the ->dynticks counter with full ordering so as to allow
  * stable comparison of this counter with past and future snapshots.
  */
-int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(struct rcu_data *rdp)
 {
 	int snap = atomic_add_return(0, &rdp->dynticks);
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 055c31781d3a..e4dc5debfc84 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -403,8 +403,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
 #define RCU_NAME rcu_name
 #endif /* #else #ifdef CONFIG_TRACING */
 
-int rcu_dynticks_snap(struct rcu_data *rdp);
-
 /* Forward declarations for tree_plugin.h */
 static void rcu_bootup_announce(void);
 static void rcu_qs(void);

From 860c8802ace14c646864795e057349c9fb2d60ad Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sat, 9 Nov 2019 09:42:13 -0800
Subject: [PATCH 41/69] rcu: Use WRITE_ONCE() for assignments to ->pprev for
 hlist_nulls

Eric Dumazet supplied a KCSAN report of a bug that forces use
of hlist_unhashed_lockless() from sk_unhashed():

------------------------------------------------------------------------

BUG: KCSAN: data-race in inet_unhash / inet_unhash

write to 0xffff8880a69a0170 of 8 bytes by interrupt on cpu 1:
 __hlist_nulls_del include/linux/list_nulls.h:88 [inline]
 hlist_nulls_del_init_rcu include/linux/rculist_nulls.h:36 [inline]
 __sk_nulls_del_node_init_rcu include/net/sock.h:676 [inline]
 inet_unhash+0x38f/0x4a0 net/ipv4/inet_hashtables.c:612
 tcp_set_state+0xfa/0x3e0 net/ipv4/tcp.c:2249
 tcp_done+0x93/0x1e0 net/ipv4/tcp.c:3854
 tcp_write_err+0x7e/0xc0 net/ipv4/tcp_timer.c:56
 tcp_retransmit_timer+0x9b8/0x16d0 net/ipv4/tcp_timer.c:479
 tcp_write_timer_handler+0x42d/0x510 net/ipv4/tcp_timer.c:599
 tcp_write_timer+0xd1/0xf0 net/ipv4/tcp_timer.c:619
 call_timer_fn+0x5f/0x2f0 kernel/time/timer.c:1404
 expire_timers kernel/time/timer.c:1449 [inline]
 __run_timers kernel/time/timer.c:1773 [inline]
 __run_timers kernel/time/timer.c:1740 [inline]
 run_timer_softirq+0xc0c/0xcd0 kernel/time/timer.c:1786
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 invoke_softirq kernel/softirq.c:373 [inline]
 irq_exit+0xbb/0xe0 kernel/softirq.c:413
 exiting_irq arch/x86/include/asm/apic.h:536 [inline]
 smp_apic_timer_interrupt+0xe6/0x280 arch/x86/kernel/apic/apic.c:1137
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830
 native_safe_halt+0xe/0x10 arch/x86/kernel/paravirt.c:71
 arch_cpu_idle+0x1f/0x30 arch/x86/kernel/process.c:571
 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94
 cpuidle_idle_call kernel/sched/idle.c:154 [inline]
 do_idle+0x1af/0x280 kernel/sched/idle.c:263
 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355
 start_secondary+0x208/0x260 arch/x86/kernel/smpboot.c:264
 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241

read to 0xffff8880a69a0170 of 8 bytes by interrupt on cpu 0:
 sk_unhashed include/net/sock.h:607 [inline]
 inet_unhash+0x3d/0x4a0 net/ipv4/inet_hashtables.c:592
 tcp_set_state+0xfa/0x3e0 net/ipv4/tcp.c:2249
 tcp_done+0x93/0x1e0 net/ipv4/tcp.c:3854
 tcp_write_err+0x7e/0xc0 net/ipv4/tcp_timer.c:56
 tcp_retransmit_timer+0x9b8/0x16d0 net/ipv4/tcp_timer.c:479
 tcp_write_timer_handler+0x42d/0x510 net/ipv4/tcp_timer.c:599
 tcp_write_timer+0xd1/0xf0 net/ipv4/tcp_timer.c:619
 call_timer_fn+0x5f/0x2f0 kernel/time/timer.c:1404
 expire_timers kernel/time/timer.c:1449 [inline]
 __run_timers kernel/time/timer.c:1773 [inline]
 __run_timers kernel/time/timer.c:1740 [inline]
 run_timer_softirq+0xc0c/0xcd0 kernel/time/timer.c:1786
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 invoke_softirq kernel/softirq.c:373 [inline]
 irq_exit+0xbb/0xe0 kernel/softirq.c:413
 exiting_irq arch/x86/include/asm/apic.h:536 [inline]
 smp_apic_timer_interrupt+0xe6/0x280 arch/x86/kernel/apic/apic.c:1137
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830
 native_safe_halt+0xe/0x10 arch/x86/kernel/paravirt.c:71
 arch_cpu_idle+0x1f/0x30 arch/x86/kernel/process.c:571
 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94
 cpuidle_idle_call kernel/sched/idle.c:154 [inline]
 do_idle+0x1af/0x280 kernel/sched/idle.c:263
 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355
 rest_init+0xec/0xf6 init/main.c:452
 arch_call_rest_init+0x17/0x37
 start_kernel+0x838/0x85e init/main.c:786
 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490
 x86_64_start_kernel+0x72/0x76 arch/x86/kernel/head64.c:471
 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc6+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine,
BIOS Google 01/01/2011

------------------------------------------------------------------------

This commit therefore replaces C-language assignments with WRITE_ONCE()
in include/linux/list_nulls.h and include/linux/rculist_nulls.h.

Reported-by: Eric Dumazet <edumazet@google.com> # For KCSAN
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/list_nulls.h    | 8 ++++----
 include/linux/rculist_nulls.h | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index 3ef96743db8d..1ecd35664e0d 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -72,10 +72,10 @@ static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
 	struct hlist_nulls_node *first = h->first;
 
 	n->next = first;
-	n->pprev = &h->first;
+	WRITE_ONCE(n->pprev, &h->first);
 	h->first = n;
 	if (!is_a_nulls(first))
-		first->pprev = &n->next;
+		WRITE_ONCE(first->pprev, &n->next);
 }
 
 static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
@@ -85,13 +85,13 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
 
 	WRITE_ONCE(*pprev, next);
 	if (!is_a_nulls(next))
-		next->pprev = pprev;
+		WRITE_ONCE(next->pprev, pprev);
 }
 
 static inline void hlist_nulls_del(struct hlist_nulls_node *n)
 {
 	__hlist_nulls_del(n);
-	n->pprev = LIST_POISON2;
+	WRITE_ONCE(n->pprev, LIST_POISON2);
 }
 
 /**
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index bc8206a8f30e..517a06f36c7a 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -34,7 +34,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
 {
 	if (!hlist_nulls_unhashed(n)) {
 		__hlist_nulls_del(n);
-		n->pprev = NULL;
+		WRITE_ONCE(n->pprev, NULL);
 	}
 }
 
@@ -66,7 +66,7 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
 static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
 {
 	__hlist_nulls_del(n);
-	n->pprev = LIST_POISON2;
+	WRITE_ONCE(n->pprev, LIST_POISON2);
 }
 
 /**
@@ -94,10 +94,10 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
 	struct hlist_nulls_node *first = h->first;
 
 	n->next = first;
-	n->pprev = &h->first;
+	WRITE_ONCE(n->pprev, &h->first);
 	rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
 	if (!is_a_nulls(first))
-		first->pprev = &n->next;
+		WRITE_ONCE(first->pprev, &n->next);
 }
 
 /**

From 46deb7449d99f37bebf5cbd7f95c136c6fafeaa5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sat, 9 Nov 2019 10:35:13 -0800
Subject: [PATCH 42/69] rcu: Add and update docbook header comments in list.h

[ paulmck: Fix typo found by kbuild test robot. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/list.h | 112 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 95 insertions(+), 17 deletions(-)

diff --git a/include/linux/list.h b/include/linux/list.h
index 61f5aaf96192..4f3b7f71bdfd 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -23,6 +23,13 @@
 #define LIST_HEAD(name) \
 	struct list_head name = LIST_HEAD_INIT(name)
 
+/**
+ * INIT_LIST_HEAD - Initialize a list_head structure
+ * @list: list_head structure to be initialized.
+ *
+ * Initializes the list_head to point to itself.  If it is a list header,
+ * the result is an empty list.
+ */
 static inline void INIT_LIST_HEAD(struct list_head *list)
 {
 	WRITE_ONCE(list->next, list);
@@ -120,12 +127,6 @@ static inline void __list_del_clearprev(struct list_head *entry)
 	entry->prev = NULL;
 }
 
-/**
- * list_del - deletes entry from list.
- * @entry: the element to delete from the list.
- * Note: list_empty() on entry does not return true after this, the entry is
- * in an undefined state.
- */
 static inline void __list_del_entry(struct list_head *entry)
 {
 	if (!__list_del_entry_valid(entry))
@@ -134,6 +135,12 @@ static inline void __list_del_entry(struct list_head *entry)
 	__list_del(entry->prev, entry->next);
 }
 
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty() on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
 static inline void list_del(struct list_head *entry)
 {
 	__list_del_entry(entry);
@@ -157,8 +164,15 @@ static inline void list_replace(struct list_head *old,
 	new->prev->next = new;
 }
 
+/**
+ * list_replace_init - replace old entry by new one and initialize the old one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * If @old was empty, it will be overwritten.
+ */
 static inline void list_replace_init(struct list_head *old,
-					struct list_head *new)
+				     struct list_head *new)
 {
 	list_replace(old, new);
 	INIT_LIST_HEAD(old);
@@ -744,21 +758,36 @@ static inline void INIT_HLIST_NODE(struct hlist_node *h)
 	h->pprev = NULL;
 }
 
+/**
+ * hlist_unhashed - Has node been removed from list and reinitialized?
+ * @h: Node to be checked
+ *
+ * Not that not all removal functions will leave a node in unhashed
+ * state.  For example, hlist_nulls_del_init_rcu() does leave the
+ * node in unhashed state, but hlist_nulls_del() does not.
+ */
 static inline int hlist_unhashed(const struct hlist_node *h)
 {
 	return !h->pprev;
 }
 
-/* This variant of hlist_unhashed() must be used in lockless contexts
- * to avoid potential load-tearing.
- * The READ_ONCE() is paired with the various WRITE_ONCE() in hlist
- * helpers that are defined below.
+/**
+ * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use
+ * @h: Node to be checked
+ *
+ * This variant of hlist_unhashed() must be used in lockless contexts
+ * to avoid potential load-tearing.  The READ_ONCE() is paired with the
+ * various WRITE_ONCE() in hlist helpers that are defined below.
  */
 static inline int hlist_unhashed_lockless(const struct hlist_node *h)
 {
 	return !READ_ONCE(h->pprev);
 }
 
+/**
+ * hlist_empty - Is the specified hlist_head structure an empty hlist?
+ * @h: Structure to check.
+ */
 static inline int hlist_empty(const struct hlist_head *h)
 {
 	return !READ_ONCE(h->first);
@@ -774,6 +803,13 @@ static inline void __hlist_del(struct hlist_node *n)
 		WRITE_ONCE(next->pprev, pprev);
 }
 
+/**
+ * hlist_del - Delete the specified hlist_node from its list
+ * @n: Node to delete.
+ *
+ * Note that this function leaves the node in hashed state.  Use
+ * hlist_del_init() or similar instead to unhash @n.
+ */
 static inline void hlist_del(struct hlist_node *n)
 {
 	__hlist_del(n);
@@ -781,6 +817,12 @@ static inline void hlist_del(struct hlist_node *n)
 	n->pprev = LIST_POISON2;
 }
 
+/**
+ * hlist_del_init - Delete the specified hlist_node from its list and initialize
+ * @n: Node to delete.
+ *
+ * Note that this function leaves the node in unhashed state.
+ */
 static inline void hlist_del_init(struct hlist_node *n)
 {
 	if (!hlist_unhashed(n)) {
@@ -789,6 +831,14 @@ static inline void hlist_del_init(struct hlist_node *n)
 	}
 }
 
+/**
+ * hlist_add_head - add a new entry at the beginning of the hlist
+ * @n: new entry to be added
+ * @h: hlist head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
 static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
 {
 	struct hlist_node *first = h->first;
@@ -799,9 +849,13 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
 	WRITE_ONCE(n->pprev, &h->first);
 }
 
-/* next must be != NULL */
+/**
+ * hlist_add_before - add a new entry before the one specified
+ * @n: new entry to be added
+ * @next: hlist node to add it before, which must be non-NULL
+ */
 static inline void hlist_add_before(struct hlist_node *n,
-					struct hlist_node *next)
+				    struct hlist_node *next)
 {
 	WRITE_ONCE(n->pprev, next->pprev);
 	WRITE_ONCE(n->next, next);
@@ -809,6 +863,11 @@ static inline void hlist_add_before(struct hlist_node *n,
 	WRITE_ONCE(*(n->pprev), n);
 }
 
+/**
+ * hlist_add_behing - add a new entry after the one specified
+ * @n: new entry to be added
+ * @prev: hlist node to add it after, which must be non-NULL
+ */
 static inline void hlist_add_behind(struct hlist_node *n,
 				    struct hlist_node *prev)
 {
@@ -820,20 +879,35 @@ static inline void hlist_add_behind(struct hlist_node *n,
 		WRITE_ONCE(n->next->pprev, &n->next);
 }
 
-/* after that we'll appear to be on some hlist and hlist_del will work */
+/**
+ * hlist_add_fake - create a fake hlist consisting of a single headless node
+ * @n: Node to make a fake list out of
+ *
+ * This makes @n appear to be its own predecessor on a headless hlist.
+ * The point of this is to allow things like hlist_del() to work correctly
+ * in cases where there is no list.
+ */
 static inline void hlist_add_fake(struct hlist_node *n)
 {
 	n->pprev = &n->next;
 }
 
+/**
+ * hlist_fake: Is this node a fake hlist?
+ * @h: Node to check for being a self-referential fake hlist.
+ */
 static inline bool hlist_fake(struct hlist_node *h)
 {
 	return h->pprev == &h->next;
 }
 
-/*
+/**
+ * hlist_is_singular_node - is node the only element of the specified hlist?
+ * @n: Node to check for singularity.
+ * @h: Header for potentially singular list.
+ *
  * Check whether the node is the only node of the head without
- * accessing head:
+ * accessing head, thus avoiding unnecessary cache misses.
  */
 static inline bool
 hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
@@ -841,7 +915,11 @@ hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
 	return !n->next && n->pprev == &h->first;
 }
 
-/*
+/**
+ * hlist_move_list - Move an hlist
+ * @old: hlist_head for old list.
+ * @new: hlist_head for new list.
+ *
  * Move a list from one list head to another. Fixup the pprev
  * reference of the first entry if it exists.
  */

From 02b99b38f3d96c77cf0a368d99952aa372dfe58a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sat, 9 Nov 2019 10:45:47 -0800
Subject: [PATCH 43/69] rcu: Add a hlist_nulls_unhashed_lockless() function

This commit adds an hlist_nulls_unhashed_lockless() to allow lockless
checking for whether or note an hlist_nulls_node is hashed or not.
While in the area, this commit also adds a docbook comment to the existing
hlist_nulls_unhashed() function.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/list_nulls.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index 1ecd35664e0d..fa6e8471bd22 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -56,11 +56,33 @@ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
 	return ((unsigned long)ptr) >> 1;
 }
 
+/**
+ * hlist_nulls_unhashed - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Not that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.
+ */
 static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
 {
 	return !h->pprev;
 }
 
+/**
+ * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Not that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.  Unlike hlist_nulls_unhashed(), this
+ * function may be used locklessly.
+ */
+static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
+{
+	return !READ_ONCE(h->pprev);
+}
+
 static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
 {
 	return is_a_nulls(READ_ONCE(h->first));

From 7f5d51e26a471f771b8dae1b9ef417f5fd5e9c85 Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Date: Thu, 5 Dec 2019 11:46:49 +0530
Subject: [PATCH 44/69] rculist_nulls: Add docbook comments

This patch adds docbook comment headers for hlist_nulls_first_rcu()
and hlist_nulls_next_rcu() in rculist_nulls.h.

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rculist_nulls.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 517a06f36c7a..25952c4f83b0 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -38,9 +38,17 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
 	}
 }
 
+/**
+ * hlist_nulls_first_rcu - returns the first element of the hash list.
+ * @head: the head of the list.
+ */
 #define hlist_nulls_first_rcu(head) \
 	(*((struct hlist_nulls_node __rcu __force **)&(head)->first))
 
+/**
+ * hlist_nulls_next_rcu - returns the element of the list after @node.
+ * @node: element of the list.
+ */
 #define hlist_nulls_next_rcu(node) \
 	(*((struct hlist_nulls_node __rcu __force **)&(node)->next))
 

From 459b5287066f53c4b91569c070780a540de90b85 Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Date: Fri, 6 Dec 2019 00:23:52 +0530
Subject: [PATCH 45/69] rculist_nulls: Change docbook comment headers

This patch changes the docbook comment "head for your list"
to "head of the list".

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rculist_nulls.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 25952c4f83b0..409a86bb5f25 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -112,7 +112,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
  * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
- * @head:	the head for your list.
+ * @head:	the head of the list.
  * @member:	the name of the hlist_nulls_node within the struct.
  *
  * The barrier() is needed to make sure compiler doesn't cache first element [1],
@@ -132,7 +132,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
  *   iterate over list of given type safe against removal of list entry
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
- * @head:	the head for your list.
+ * @head:	the head of the list.
  * @member:	the name of the hlist_nulls_node within the struct.
  */
 #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member)		\

From afa47fdfa29ffd3324e7b89551d1a6e54ccc042b Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
Date: Mon, 9 Dec 2019 13:20:43 +0530
Subject: [PATCH 46/69] rculist.h: Add list_tail_rcu()

This patch adds the macro list_tail_rcu() and documents it.

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik04@gmail.com>
[ paulmck: Reword a bit. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rculist.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4b7ae1bf50b3..9f313e4999fe 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -40,6 +40,16 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
  */
 #define list_next_rcu(list)	(*((struct list_head __rcu **)(&(list)->next)))
 
+/**
+ * list_tail_rcu - returns the prev pointer of the head of the list
+ * @head: the head of the list
+ *
+ * Note: This should only be used with the list header, and even then
+ * only if list_del() and similar primitives are not also used on the
+ * list header.
+ */
+#define list_tail_rcu(head)	(*((struct list_head __rcu **)(&(head)->prev)))
+
 /*
  * Check during list traversal that we are within an RCU reader
  */

From a35d16905efc6ad5523d864a5c6efcb1e657e386 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Mon, 5 Aug 2019 18:22:27 -0400
Subject: [PATCH 47/69] rcu: Add basic support for kfree_rcu() batching

Recently a discussion about stability and performance of a system
involving a high rate of kfree_rcu() calls surfaced on the list [1]
which led to another discussion how to prepare for this situation.

This patch adds basic batching support for kfree_rcu(). It is "basic"
because we do none of the slab management, dynamic allocation, code
moving or any of the other things, some of which previous attempts did
[2]. These fancier improvements can be follow-up patches and there are
different ideas being discussed in those regards. This is an effort to
start simple, and build up from there. In the future, an extension to
use kfree_bulk and possibly per-slab batching could be done to further
improve performance due to cache-locality and slab-specific bulk free
optimizations. By using an array of pointers, the worker thread
processing the work would need to read lesser data since it does not
need to deal with large rcu_head(s) any longer.

Torture tests follow in the next patch and show improvements of around
5x reduction in number of  grace periods on a 16 CPU system. More
details and test data are in that patch.

There is an implication with rcu_barrier() with this patch. Since the
kfree_rcu() calls can be batched, and may not be handed yet to the RCU
machinery in fact, the monitor may not have even run yet to do the
queue_rcu_work(), there seems no easy way of implementing rcu_barrier()
to wait for those kfree_rcu()s that are already made. So this means a
kfree_rcu() followed by an rcu_barrier() does not imply that memory will
be freed once rcu_barrier() returns.

Another implication is higher active memory usage (although not
run-away..) until the kfree_rcu() flooding ends, in comparison to
without batching. More details about this are in the second patch which
adds an rcuperf test.

Finally, in the near future we will get rid of kfree_rcu() special casing
within RCU such as in rcu_do_batch and switch everything to just
batching. Currently we don't do that since timer subsystem is not yet up
and we cannot schedule the kfree_rcu() monitor as the timer subsystem's
lock are not initialized. That would also mean getting rid of
kfree_call_rcu_nobatch() entirely.

[1] http://lore.kernel.org/lkml/20190723035725-mutt-send-email-mst@kernel.org
[2] https://lkml.org/lkml/2017/12/19/824

Cc: kernel-team@android.com
Cc: kernel-team@lge.com
Co-developed-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
[ paulmck: Applied 0day and Paul Walmsley feedback on ->monitor_todo. ]
[ paulmck: Make it work during early boot. ]
[ paulmck: Add a crude early boot self-test. ]
[ paulmck: Style adjustments and experimental docbook structure header. ]
Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.9999.1908161931110.32497@viisi.sifive.com/T/#me9956f66cb611b95d26ae92700e1d901f46e8c59
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h |   6 ++
 include/linux/rcutree.h |   2 +
 kernel/rcu/tree.c       | 196 ++++++++++++++++++++++++++++++++++++++--
 kernel/rcu/update.c     |  10 ++
 4 files changed, 207 insertions(+), 7 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 37b6f0c2b79d..1bd166aab6f3 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -39,6 +39,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	call_rcu(head, func);
 }
 
+static inline void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
+{
+	call_rcu(head, func);
+}
+
 void rcu_qs(void);
 
 static inline void rcu_softirq_qs(void)
@@ -85,6 +90,7 @@ static inline void rcu_scheduler_starting(void) { }
 static inline void rcu_end_inkernel_boot(void) { }
 static inline bool rcu_is_watching(void) { return true; }
 static inline void rcu_momentary_dyntick_idle(void) { }
+static inline void kfree_rcu_scheduler_running(void) { }
 
 /* Avoid RCU read-side critical sections leaking across. */
 static inline void rcu_all_qs(void) { barrier(); }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index c5147de885ec..6a65d3a16dbd 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,10 +34,12 @@ static inline void rcu_virt_note_context_switch(int cpu)
 
 void synchronize_rcu_expedited(void);
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
+void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
 
 void rcu_barrier(void);
 bool rcu_eqs_special_set(int cpu);
 void rcu_momentary_dyntick_idle(void);
+void kfree_rcu_scheduler_running(void);
 unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1694a6b57ad8..0af016fdbf19 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2683,19 +2683,187 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
-/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (HZ / 50)
+
+/**
+ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @head_free: List of kfree_rcu() objects already waiting for a grace period
+ * @lock: Synchronize access to this structure
+ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+ * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
+ * @initialized: The @lock and @rcu_work fields have been initialized
+ *
+ * This is a per-CPU structure.  The reason that it is not included in
+ * the rcu_data structure is to permit this code to be extracted from
+ * the RCU files.  Such extraction could allow further optimization of
+ * the interactions with the slab allocators.
  */
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+struct kfree_rcu_cpu {
+	struct rcu_work rcu_work;
+	struct rcu_head *head;
+	struct rcu_head *head_free;
+	spinlock_t lock;
+	struct delayed_work monitor_work;
+	int monitor_todo;
+	bool initialized;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+
+/*
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->head_free.
+ */
+static void kfree_rcu_work(struct work_struct *work)
+{
+	unsigned long flags;
+	struct rcu_head *head, *next;
+	struct kfree_rcu_cpu *krcp;
+
+	krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work);
+	spin_lock_irqsave(&krcp->lock, flags);
+	head = krcp->head_free;
+	krcp->head_free = NULL;
+	spin_unlock_irqrestore(&krcp->lock, flags);
+
+	// List "head" is now private, so traverse locklessly.
+	for (; head; head = next) {
+		next = head->next;
+		// Potentially optimize with kfree_bulk in future.
+		__rcu_reclaim(rcu_state.name, head);
+		cond_resched_tasks_rcu_qs();
+	}
+}
+
+/*
+ * Schedule the kfree batch RCU work to run in workqueue context after a GP.
+ *
+ * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
+ * timeout has been reached.
+ */
+static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+{
+	lockdep_assert_held(&krcp->lock);
+
+	// If a previous RCU batch is in progress, we cannot immediately
+	// queue another one, so return false to tell caller to retry.
+	if (krcp->head_free)
+		return false;
+
+	krcp->head_free = krcp->head;
+	krcp->head = NULL;
+	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
+	queue_rcu_work(system_wq, &krcp->rcu_work);
+	return true;
+}
+
+static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
+					  unsigned long flags)
+{
+	// Attempt to start a new batch.
+	if (queue_kfree_rcu_work(krcp)) {
+		// Success! Our job is done here.
+		spin_unlock_irqrestore(&krcp->lock, flags);
+		return;
+	}
+
+	// Previous RCU batch still in progress, try again later.
+	if (!xchg(&krcp->monitor_todo, true))
+		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+	spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+	unsigned long flags;
+	struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
+						 monitor_work.work);
+
+	spin_lock_irqsave(&krcp->lock, flags);
+	if (xchg(&krcp->monitor_todo, false))
+		kfree_rcu_drain_unlock(krcp, flags);
+	else
+		spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+/*
+ * This version of kfree_call_rcu does not do batching of kfree_rcu() requests.
+ * Used only by rcuperf torture test for comparison with kfree_rcu_batch().
+ */
+void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
 {
 	__call_rcu(head, func, 1);
 }
+EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
+
+/*
+ * Queue a request for lazy invocation of kfree() after a grace period.
+ *
+ * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
+ * will be kfree'd in workqueue context. This allows us to:
+ *
+ * 1.	Batch requests together to reduce the number of grace periods during
+ *	heavy kfree_rcu() load.
+ *
+ * 2.	It makes it possible to use kfree_bulk() on a large number of
+ *	kfree_rcu() requests thus reducing cache misses and the per-object
+ *	overhead of kfree().
+ */
+void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	unsigned long flags;
+	struct kfree_rcu_cpu *krcp;
+
+	head->func = func;
+
+	local_irq_save(flags);	// For safely calling this_cpu_ptr().
+	krcp = this_cpu_ptr(&krc);
+	if (krcp->initialized)
+		spin_lock(&krcp->lock);
+
+	// Queue the object but don't yet schedule the batch.
+	head->func = func;
+	head->next = krcp->head;
+	krcp->head = head;
+
+	// Set timer to drain after KFREE_DRAIN_JIFFIES.
+	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+	    !xchg(&krcp->monitor_todo, true))
+		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+
+	if (krcp->initialized)
+		spin_unlock(&krcp->lock);
+	local_irq_restore(flags);
+}
 EXPORT_SYMBOL_GPL(kfree_call_rcu);
 
+void __init kfree_rcu_scheduler_running(void)
+{
+	int cpu;
+	unsigned long flags;
+
+	for_each_online_cpu(cpu) {
+		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+		spin_lock_irqsave(&krcp->lock, flags);
+		if (!krcp->head || xchg(&krcp->monitor_todo, true)) {
+			spin_unlock_irqrestore(&krcp->lock, flags);
+			continue;
+		}
+		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+		spin_unlock_irqrestore(&krcp->lock, flags);
+	}
+}
+
 /*
  * During early boot, any blocking grace-period wait automatically
  * implies a grace period.  Later on, this is never the case for PREEMPT.
@@ -3557,12 +3725,26 @@ static void __init rcu_dump_rcu_node_tree(void)
 struct workqueue_struct *rcu_gp_wq;
 struct workqueue_struct *rcu_par_gp_wq;
 
+static void __init kfree_rcu_batch_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+		spin_lock_init(&krcp->lock);
+		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+		krcp->initialized = true;
+	}
+}
+
 void __init rcu_init(void)
 {
 	int cpu;
 
 	rcu_early_boot_tests();
 
+	kfree_rcu_batch_init();
 	rcu_bootup_announce();
 	rcu_init_geometry();
 	rcu_init_one();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1861103662db..196487762b96 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -40,6 +40,7 @@
 #include <linux/rcupdate_wait.h>
 #include <linux/sched/isolation.h>
 #include <linux/kprobes.h>
+#include <linux/slab.h>
 
 #define CREATE_TRACE_POINTS
 
@@ -218,6 +219,7 @@ static int __init rcu_set_runtime_mode(void)
 {
 	rcu_test_sync_prims();
 	rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+	kfree_rcu_scheduler_running();
 	rcu_test_sync_prims();
 	return 0;
 }
@@ -853,14 +855,22 @@ static void test_callback(struct rcu_head *r)
 
 DEFINE_STATIC_SRCU(early_srcu);
 
+struct early_boot_kfree_rcu {
+	struct rcu_head rh;
+};
+
 static void early_boot_test_call_rcu(void)
 {
 	static struct rcu_head head;
 	static struct rcu_head shead;
+	struct early_boot_kfree_rcu *rhp;
 
 	call_rcu(&head, test_callback);
 	if (IS_ENABLED(CONFIG_SRCU))
 		call_srcu(&early_srcu, &shead, test_callback);
+	rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+	if (!WARN_ON_ONCE(!rhp))
+		kfree_rcu(rhp, rh);
 }
 
 void rcu_early_boot_tests(void)

From e6e78b004fa7e0ab455d46d27f218bf6ce178a18 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Fri, 30 Aug 2019 12:36:29 -0400
Subject: [PATCH 48/69] rcuperf: Add kfree_rcu() performance Tests

This test runs kfree_rcu() in a loop to measure performance of the new
kfree_rcu() batching functionality.

The following table shows results when booting with arguments:
rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000
rcuperf.kfree_rcu_test=1 rcuperf.kfree_no_batch=X

rcuperf.kfree_no_batch=X    # Grace Periods	Test Duration (s)
  X=1 (old behavior)              9133                 11.5
  X=0 (new behavior)              1732                 12.5

On a 16 CPU system with the above boot parameters, we see that the total
number of grace periods that elapse during the test drops from 9133 when
not batching to 1732 when batching (a 5X improvement). The kfree_rcu()
flood itself slows down a bit when batching, though, as shown.

Note that the active memory consumption during the kfree_rcu() flood
does increase to around 200-250MB due to the batching (from around 50MB
without batching). However, this memory consumption is relatively
constant. In other words, the system is able to keep up with the
kfree_rcu() load. The memory consumption comes down considerably if
KFREE_DRAIN_JIFFIES is increased from HZ/50 to HZ/80. A later patch will
reduce memory consumption further by using multiple lists.

Also, when running the test, please disable CONFIG_DEBUG_PREEMPT and
CONFIG_PROVE_RCU for realistic comparisons with/without batching.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  17 ++
 kernel/rcu/rcuperf.c                          | 181 +++++++++++++++++-
 2 files changed, 190 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index ade4e6ec23e0..3ce270b56f3a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3978,6 +3978,23 @@
 			test until boot completes in order to avoid
 			interference.
 
+	rcuperf.kfree_rcu_test= [KNL]
+			Set to measure performance of kfree_rcu() flooding.
+
+	rcuperf.kfree_nthreads= [KNL]
+			The number of threads running loops of kfree_rcu().
+
+	rcuperf.kfree_alloc_num= [KNL]
+			Number of allocations and frees done in an iteration.
+
+	rcuperf.kfree_loops= [KNL]
+			Number of loops doing rcuperf.kfree_alloc_num number
+			of allocations and frees.
+
+	rcuperf.kfree_no_batch= [KNL]
+			Use the non-batching (less efficient) version of kfree_rcu().
+			This is useful for comparing with the batched version.
+
 	rcuperf.nreaders= [KNL]
 			Set number of RCU readers.  The value -1 selects
 			N, where N is the number of CPUs.  A value
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 5f884d560384..c1e25fd10f2a 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
 	      "Shutdown at end of performance tests.");
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
 torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?");
 
 static char *perf_type = "rcu";
 module_param(perf_type, charp, 0444);
@@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished;
 static wait_queue_head_t shutdown_wq;
 static u64 t_rcu_perf_writer_started;
 static u64 t_rcu_perf_writer_finished;
-static unsigned long b_rcu_perf_writer_started;
-static unsigned long b_rcu_perf_writer_finished;
+static unsigned long b_rcu_gp_test_started;
+static unsigned long b_rcu_gp_test_finished;
 static DEFINE_PER_CPU(atomic_t, n_async_inflight);
 
 #define MAX_MEAS 10000
@@ -378,10 +379,10 @@ rcu_perf_writer(void *arg)
 	if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
 		t_rcu_perf_writer_started = t;
 		if (gp_exp) {
-			b_rcu_perf_writer_started =
+			b_rcu_gp_test_started =
 				cur_ops->exp_completed() / 2;
 		} else {
-			b_rcu_perf_writer_started = cur_ops->get_gp_seq();
+			b_rcu_gp_test_started = cur_ops->get_gp_seq();
 		}
 	}
 
@@ -429,10 +430,10 @@ retry:
 				PERFOUT_STRING("Test complete");
 				t_rcu_perf_writer_finished = t;
 				if (gp_exp) {
-					b_rcu_perf_writer_finished =
+					b_rcu_gp_test_finished =
 						cur_ops->exp_completed() / 2;
 				} else {
-					b_rcu_perf_writer_finished =
+					b_rcu_gp_test_finished =
 						cur_ops->get_gp_seq();
 				}
 				if (shutdown) {
@@ -515,8 +516,8 @@ rcu_perf_cleanup(void)
 			 t_rcu_perf_writer_finished -
 			 t_rcu_perf_writer_started,
 			 ngps,
-			 rcuperf_seq_diff(b_rcu_perf_writer_finished,
-					  b_rcu_perf_writer_started));
+			 rcuperf_seq_diff(b_rcu_gp_test_finished,
+					  b_rcu_gp_test_started));
 		for (i = 0; i < nrealwriters; i++) {
 			if (!writer_durations)
 				break;
@@ -584,6 +585,167 @@ rcu_perf_shutdown(void *arg)
 	return -EINVAL;
 }
 
+/*
+ * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number
+ * of iterations and measure total time and number of GP for all iterations to complete.
+ */
+
+torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
+torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
+torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
+torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu().");
+
+static struct task_struct **kfree_reader_tasks;
+static int kfree_nrealthreads;
+static atomic_t n_kfree_perf_thread_started;
+static atomic_t n_kfree_perf_thread_ended;
+
+struct kfree_obj {
+	char kfree_obj[8];
+	struct rcu_head rh;
+};
+
+static int
+kfree_perf_thread(void *arg)
+{
+	int i, loop = 0;
+	long me = (long)arg;
+	struct kfree_obj *alloc_ptr;
+	u64 start_time, end_time;
+
+	VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+
+	start_time = ktime_get_mono_fast_ns();
+
+	if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) {
+		if (gp_exp)
+			b_rcu_gp_test_started = cur_ops->exp_completed() / 2;
+		else
+			b_rcu_gp_test_started = cur_ops->get_gp_seq();
+	}
+
+	do {
+		for (i = 0; i < kfree_alloc_num; i++) {
+			alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
+			if (!alloc_ptr)
+				return -ENOMEM;
+
+			if (!kfree_no_batch) {
+				kfree_rcu(alloc_ptr, rh);
+			} else {
+				rcu_callback_t cb;
+
+				cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
+				kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb);
+			}
+		}
+
+		cond_resched();
+	} while (!torture_must_stop() && ++loop < kfree_loops);
+
+	if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
+		end_time = ktime_get_mono_fast_ns();
+
+		if (gp_exp)
+			b_rcu_gp_test_finished = cur_ops->exp_completed() / 2;
+		else
+			b_rcu_gp_test_finished = cur_ops->get_gp_seq();
+
+		pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n",
+		       (unsigned long long)(end_time - start_time), kfree_loops,
+		       rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started));
+		if (shutdown) {
+			smp_mb(); /* Assign before wake. */
+			wake_up(&shutdown_wq);
+		}
+	}
+
+	torture_kthread_stopping("kfree_perf_thread");
+	return 0;
+}
+
+static void
+kfree_perf_cleanup(void)
+{
+	int i;
+
+	if (torture_cleanup_begin())
+		return;
+
+	if (kfree_reader_tasks) {
+		for (i = 0; i < kfree_nrealthreads; i++)
+			torture_stop_kthread(kfree_perf_thread,
+					     kfree_reader_tasks[i]);
+		kfree(kfree_reader_tasks);
+	}
+
+	torture_cleanup_end();
+}
+
+/*
+ * shutdown kthread.  Just waits to be awakened, then shuts down system.
+ */
+static int
+kfree_perf_shutdown(void *arg)
+{
+	do {
+		wait_event(shutdown_wq,
+			   atomic_read(&n_kfree_perf_thread_ended) >=
+			   kfree_nrealthreads);
+	} while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+
+	smp_mb(); /* Wake before output. */
+
+	kfree_perf_cleanup();
+	kernel_power_off();
+	return -EINVAL;
+}
+
+static int __init
+kfree_perf_init(void)
+{
+	long i;
+	int firsterr = 0;
+
+	kfree_nrealthreads = compute_real(kfree_nthreads);
+	/* Start up the kthreads. */
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(kfree_perf_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+
+	kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
+			       GFP_KERNEL);
+	if (kfree_reader_tasks == NULL) {
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+
+	for (i = 0; i < kfree_nrealthreads; i++) {
+		firsterr = torture_create_kthread(kfree_perf_thread, (void *)i,
+						  kfree_reader_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+
+	while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads)
+		schedule_timeout_uninterruptible(1);
+
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	kfree_perf_cleanup();
+	return firsterr;
+}
+
 static int __init
 rcu_perf_init(void)
 {
@@ -616,6 +778,9 @@ rcu_perf_init(void)
 	if (cur_ops->init)
 		cur_ops->init();
 
+	if (kfree_rcu_test)
+		return kfree_perf_init();
+
 	nrealwriters = compute_real(nwriters);
 	nrealreaders = compute_real(nreaders);
 	atomic_set(&n_rcu_perf_reader_started, 0);

From 569d767087ef56a59bc2d1d5f25ee447eeae442a Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joel@joelfernandes.org>
Date: Sun, 22 Sep 2019 10:49:57 -0700
Subject: [PATCH 49/69] rcu: Make kfree_rcu() use a non-atomic ->monitor_todo

Because the ->monitor_todo field is always protected by krcp->lock,
this commit downgrades from xchg() to non-atomic unmarked assignment
statements.

Signed-off-by: Joel Fernandes <joel@joelfernandes.org>
[ paulmck: Update to include early-boot kick code. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0af016fdbf19..6106b9e0b5fb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2708,7 +2708,7 @@ struct kfree_rcu_cpu {
 	struct rcu_head *head_free;
 	spinlock_t lock;
 	struct delayed_work monitor_work;
-	int monitor_todo;
+	bool monitor_todo;
 	bool initialized;
 };
 
@@ -2765,6 +2765,7 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
 					  unsigned long flags)
 {
 	// Attempt to start a new batch.
+	krcp->monitor_todo = false;
 	if (queue_kfree_rcu_work(krcp)) {
 		// Success! Our job is done here.
 		spin_unlock_irqrestore(&krcp->lock, flags);
@@ -2772,8 +2773,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
 	}
 
 	// Previous RCU batch still in progress, try again later.
-	if (!xchg(&krcp->monitor_todo, true))
-		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+	krcp->monitor_todo = true;
+	schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
 	spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
@@ -2788,7 +2789,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
 						 monitor_work.work);
 
 	spin_lock_irqsave(&krcp->lock, flags);
-	if (xchg(&krcp->monitor_todo, false))
+	if (krcp->monitor_todo)
 		kfree_rcu_drain_unlock(krcp, flags);
 	else
 		spin_unlock_irqrestore(&krcp->lock, flags);
@@ -2837,8 +2838,10 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 
 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
-	    !xchg(&krcp->monitor_todo, true))
+	    !krcp->monitor_todo) {
+		krcp->monitor_todo = true;
 		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+	}
 
 	if (krcp->initialized)
 		spin_unlock(&krcp->lock);
@@ -2855,10 +2858,11 @@ void __init kfree_rcu_scheduler_running(void)
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
 		spin_lock_irqsave(&krcp->lock, flags);
-		if (!krcp->head || xchg(&krcp->monitor_todo, true)) {
+		if (!krcp->head || krcp->monitor_todo) {
 			spin_unlock_irqrestore(&krcp->lock, flags);
 			continue;
 		}
+		krcp->monitor_todo = true;
 		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
 		spin_unlock_irqrestore(&krcp->lock, flags);
 	}

From 0392bebebf26f09434e6c7ca4c09c014efeef76a Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Thu, 19 Sep 2019 14:58:26 -0700
Subject: [PATCH 50/69] rcu: Add multiple in-flight batches of kfree_rcu() work

During testing, it was observed that amount of memory consumed due
kfree_rcu() batching is 300-400MB. Previously we had only a single
head_free pointer pointing to the list of rcu_head(s) that are to be
freed after a grace period. Until this list is drained, we cannot queue
any more objects on it since such objects may not be ready to be
reclaimed when the worker thread eventually gets to drainin g the
head_free list.

We can do better by maintaining multiple lists as done by this patch.
Testing shows that memory consumption came down by around 100-150MB with
just adding another list. Adding more than 1 additional list did not
show any improvement.

Suggested-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
[ paulmck: Code style and initialization handling. ]
[ paulmck: Fix field name, reported by kbuild test robot <lkp@intel.com>. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 51 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6106b9e0b5fb..a40fd58bd4b6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2686,12 +2686,25 @@ EXPORT_SYMBOL_GPL(call_rcu);
 
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_N_BATCHES 2
+
+/**
+ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @krcp: Pointer to @kfree_rcu_cpu structure
+ */
+
+struct kfree_rcu_cpu_work {
+	struct rcu_work rcu_work;
+	struct rcu_head *head_free;
+	struct kfree_rcu_cpu *krcp;
+};
 
 /**
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
- * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @head_free: List of kfree_rcu() objects already waiting for a grace period
+ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
  * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
@@ -2703,9 +2716,8 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * the interactions with the slab allocators.
  */
 struct kfree_rcu_cpu {
-	struct rcu_work rcu_work;
 	struct rcu_head *head;
-	struct rcu_head *head_free;
+	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	spinlock_t lock;
 	struct delayed_work monitor_work;
 	bool monitor_todo;
@@ -2723,11 +2735,14 @@ static void kfree_rcu_work(struct work_struct *work)
 	unsigned long flags;
 	struct rcu_head *head, *next;
 	struct kfree_rcu_cpu *krcp;
+	struct kfree_rcu_cpu_work *krwp;
 
-	krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work);
+	krwp = container_of(to_rcu_work(work),
+			    struct kfree_rcu_cpu_work, rcu_work);
+	krcp = krwp->krcp;
 	spin_lock_irqsave(&krcp->lock, flags);
-	head = krcp->head_free;
-	krcp->head_free = NULL;
+	head = krwp->head_free;
+	krwp->head_free = NULL;
 	spin_unlock_irqrestore(&krcp->lock, flags);
 
 	// List "head" is now private, so traverse locklessly.
@@ -2747,17 +2762,25 @@ static void kfree_rcu_work(struct work_struct *work)
  */
 static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
+	int i;
+	struct kfree_rcu_cpu_work *krwp = NULL;
+
 	lockdep_assert_held(&krcp->lock);
+	for (i = 0; i < KFREE_N_BATCHES; i++)
+		if (!krcp->krw_arr[i].head_free) {
+			krwp = &(krcp->krw_arr[i]);
+			break;
+		}
 
 	// If a previous RCU batch is in progress, we cannot immediately
 	// queue another one, so return false to tell caller to retry.
-	if (krcp->head_free)
+	if (!krwp)
 		return false;
 
-	krcp->head_free = krcp->head;
+	krwp->head_free = krcp->head;
 	krcp->head = NULL;
-	INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work);
-	queue_rcu_work(system_wq, &krcp->rcu_work);
+	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
+	queue_rcu_work(system_wq, &krwp->rcu_work);
 	return true;
 }
 
@@ -2863,7 +2886,8 @@ void __init kfree_rcu_scheduler_running(void)
 			continue;
 		}
 		krcp->monitor_todo = true;
-		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+		schedule_delayed_work_on(cpu, &krcp->monitor_work,
+					 KFREE_DRAIN_JIFFIES);
 		spin_unlock_irqrestore(&krcp->lock, flags);
 	}
 }
@@ -3732,11 +3756,14 @@ struct workqueue_struct *rcu_par_gp_wq;
 static void __init kfree_rcu_batch_init(void)
 {
 	int cpu;
+	int i;
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
 		spin_lock_init(&krcp->lock);
+		for (i = 0; i < KFREE_N_BATCHES; i++)
+			krcp->krw_arr[i].krcp = krcp;
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 		krcp->initialized = true;
 	}

From e99637becb2e684bee2b9117f817f4d1346b8353 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sun, 22 Sep 2019 13:03:17 -0700
Subject: [PATCH 51/69] rcu: Add support for debug_objects debugging for
 kfree_rcu()

This commit applies RCU's debug_objects debugging to the new batched
kfree_rcu() implementations.  The object is queued at the kfree_rcu()
call and dequeued during reclaim.

Tested that enabling CONFIG_DEBUG_OBJECTS_RCU_HEAD successfully detects
double kfree_rcu() calls.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
[ paulmck: Fix IRQ per kbuild test robot <lkp@intel.com> feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a40fd58bd4b6..0512221cd84b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2749,6 +2749,7 @@ static void kfree_rcu_work(struct work_struct *work)
 	for (; head; head = next) {
 		next = head->next;
 		// Potentially optimize with kfree_bulk in future.
+		debug_rcu_head_unqueue(head);
 		__rcu_reclaim(rcu_state.name, head);
 		cond_resched_tasks_rcu_qs();
 	}
@@ -2855,6 +2856,12 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 		spin_lock(&krcp->lock);
 
 	// Queue the object but don't yet schedule the batch.
+	if (debug_rcu_head_queue(head)) {
+		// Probable double kfree_rcu(), just leak.
+		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+			  __func__, head);
+		goto unlock_return;
+	}
 	head->func = func;
 	head->next = krcp->head;
 	krcp->head = head;
@@ -2866,6 +2873,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
 	}
 
+unlock_return:
 	if (krcp->initialized)
 		spin_unlock(&krcp->lock);
 	local_irq_restore(flags);

From 77a40f97030b27b3fc1640a3ed203870f0817f57 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Fri, 30 Aug 2019 12:36:32 -0400
Subject: [PATCH 52/69] rcu: Remove kfree_rcu() special casing and
 lazy-callback handling

This commit removes kfree_rcu() special-casing and the lazy-callback
handling from Tree RCU.  It moves some of this special casing to Tiny RCU,
the removal of which will be the subject of later commits.

This results in a nice negative delta.

Suggested-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
[ paulmck: Add slab.h #include, thanks to kbuild test robot <lkp@intel.com>. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/stallwarn.txt | 11 +++-----
 include/linux/rcu_segcblist.h   |  2 --
 include/trace/events/rcu.h      | 32 +++++++++-------------
 kernel/rcu/rcu.h                | 27 -------------------
 kernel/rcu/rcu_segcblist.c      | 25 +++--------------
 kernel/rcu/rcu_segcblist.h      | 25 ++---------------
 kernel/rcu/srcutree.c           |  4 +--
 kernel/rcu/tiny.c               | 28 ++++++++++++++++++-
 kernel/rcu/tree.c               | 40 ++++++++++++++++++---------
 kernel/rcu/tree.h               |  1 -
 kernel/rcu/tree_plugin.h        | 48 ++++++++-------------------------
 kernel/rcu/tree_stall.h         |  6 ++---
 12 files changed, 90 insertions(+), 159 deletions(-)

diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index f48f4621ccbc..a360a8796710 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs
 In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
 for each CPU:
 
-	0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D
+	0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
 
 The "last_accelerate:" prints the low-order 16 bits (in hex) of the
 jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
 from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
-rcu_prepare_for_idle().  The "Nonlazy posted:" indicates lazy-callback
-status, so that an "l" indicates that all callbacks were lazy at the start
-of the last idle period and an "L" indicates that there are currently
-no non-lazy callbacks (in both cases, "." is printed otherwise, as
-shown above) and "D" indicates that dyntick-idle processing is enabled
-("." is printed otherwise, for example, if disabled via the "nohz="
-kernel boot parameter).
+rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
+processing is enabled.
 
 If the grace period ends just as the stall warning starts printing,
 there will be a spurious stall-warning message, which will include
diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 646759042333..b36afe7b22c9 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -22,7 +22,6 @@ struct rcu_cblist {
 	struct rcu_head *head;
 	struct rcu_head **tail;
 	long len;
-	long len_lazy;
 };
 
 #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head }
@@ -73,7 +72,6 @@ struct rcu_segcblist {
 #else
 	long len;
 #endif
-	long len_lazy;
 	u8 enabled;
 	u8 offloaded;
 };
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 66122602bd08..4ab16fcda895 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -481,16 +481,14 @@ TRACE_EVENT_RCU(rcu_dyntick,
  */
 TRACE_EVENT_RCU(rcu_callback,
 
-	TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen_lazy,
-		 long qlen),
+	TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen),
 
-	TP_ARGS(rcuname, rhp, qlen_lazy, qlen),
+	TP_ARGS(rcuname, rhp, qlen),
 
 	TP_STRUCT__entry(
 		__field(const char *, rcuname)
 		__field(void *, rhp)
 		__field(void *, func)
-		__field(long, qlen_lazy)
 		__field(long, qlen)
 	),
 
@@ -498,13 +496,12 @@ TRACE_EVENT_RCU(rcu_callback,
 		__entry->rcuname = rcuname;
 		__entry->rhp = rhp;
 		__entry->func = rhp->func;
-		__entry->qlen_lazy = qlen_lazy;
 		__entry->qlen = qlen;
 	),
 
-	TP_printk("%s rhp=%p func=%ps %ld/%ld",
+	TP_printk("%s rhp=%p func=%ps %ld",
 		  __entry->rcuname, __entry->rhp, __entry->func,
-		  __entry->qlen_lazy, __entry->qlen)
+		  __entry->qlen)
 );
 
 /*
@@ -518,15 +515,14 @@ TRACE_EVENT_RCU(rcu_callback,
 TRACE_EVENT_RCU(rcu_kfree_callback,
 
 	TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset,
-		 long qlen_lazy, long qlen),
+		 long qlen),
 
-	TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen),
+	TP_ARGS(rcuname, rhp, offset, qlen),
 
 	TP_STRUCT__entry(
 		__field(const char *, rcuname)
 		__field(void *, rhp)
 		__field(unsigned long, offset)
-		__field(long, qlen_lazy)
 		__field(long, qlen)
 	),
 
@@ -534,13 +530,12 @@ TRACE_EVENT_RCU(rcu_kfree_callback,
 		__entry->rcuname = rcuname;
 		__entry->rhp = rhp;
 		__entry->offset = offset;
-		__entry->qlen_lazy = qlen_lazy;
 		__entry->qlen = qlen;
 	),
 
-	TP_printk("%s rhp=%p func=%ld %ld/%ld",
+	TP_printk("%s rhp=%p func=%ld %ld",
 		  __entry->rcuname, __entry->rhp, __entry->offset,
-		  __entry->qlen_lazy, __entry->qlen)
+		  __entry->qlen)
 );
 
 /*
@@ -552,27 +547,24 @@ TRACE_EVENT_RCU(rcu_kfree_callback,
  */
 TRACE_EVENT_RCU(rcu_batch_start,
 
-	TP_PROTO(const char *rcuname, long qlen_lazy, long qlen, long blimit),
+	TP_PROTO(const char *rcuname, long qlen, long blimit),
 
-	TP_ARGS(rcuname, qlen_lazy, qlen, blimit),
+	TP_ARGS(rcuname, qlen, blimit),
 
 	TP_STRUCT__entry(
 		__field(const char *, rcuname)
-		__field(long, qlen_lazy)
 		__field(long, qlen)
 		__field(long, blimit)
 	),
 
 	TP_fast_assign(
 		__entry->rcuname = rcuname;
-		__entry->qlen_lazy = qlen_lazy;
 		__entry->qlen = qlen;
 		__entry->blimit = blimit;
 	),
 
-	TP_printk("%s CBs=%ld/%ld bl=%ld",
-		  __entry->rcuname, __entry->qlen_lazy, __entry->qlen,
-		  __entry->blimit)
+	TP_printk("%s CBs=%ld bl=%ld",
+		  __entry->rcuname, __entry->qlen, __entry->blimit)
 );
 
 /*
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ab504fbc76ca..c30a1f7dbd15 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -198,33 +198,6 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
 }
 #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
-void kfree(const void *);
-
-/*
- * Reclaim the specified callback, either by invoking it (non-lazy case)
- * or freeing it directly (lazy case).  Return true if lazy, false otherwise.
- */
-static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
-{
-	rcu_callback_t f;
-	unsigned long offset = (unsigned long)head->func;
-
-	rcu_lock_acquire(&rcu_callback_map);
-	if (__is_kfree_rcu_offset(offset)) {
-		trace_rcu_invoke_kfree_callback(rn, head, offset);
-		kfree((void *)head - offset);
-		rcu_lock_release(&rcu_callback_map);
-		return true;
-	} else {
-		trace_rcu_invoke_callback(rn, head);
-		f = head->func;
-		WRITE_ONCE(head->func, (rcu_callback_t)0L);
-		f(head);
-		rcu_lock_release(&rcu_callback_map);
-		return false;
-	}
-}
-
 #ifdef CONFIG_RCU_STALL_COMMON
 
 extern int rcu_cpu_stall_ftrace_dump;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index cbc87b804db9..5f4fd3b8777c 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -20,14 +20,10 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
 	rclp->head = NULL;
 	rclp->tail = &rclp->head;
 	rclp->len = 0;
-	rclp->len_lazy = 0;
 }
 
 /*
  * Enqueue an rcu_head structure onto the specified callback list.
- * This function assumes that the callback is non-lazy because it
- * is intended for use by no-CBs CPUs, which do not distinguish
- * between lazy and non-lazy RCU callbacks.
  */
 void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
 {
@@ -54,7 +50,6 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
 	else
 		drclp->tail = &drclp->head;
 	drclp->len = srclp->len;
-	drclp->len_lazy = srclp->len_lazy;
 	if (!rhp) {
 		rcu_cblist_init(srclp);
 	} else {
@@ -62,16 +57,12 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
 		srclp->head = rhp;
 		srclp->tail = &rhp->next;
 		WRITE_ONCE(srclp->len, 1);
-		srclp->len_lazy = 0;
 	}
 }
 
 /*
  * Dequeue the oldest rcu_head structure from the specified callback
- * list.  This function assumes that the callback is non-lazy, but
- * the caller can later invoke rcu_cblist_dequeued_lazy() if it
- * finds otherwise (and if it cares about laziness).  This allows
- * different users to have different ways of determining laziness.
+ * list.
  */
 struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
 {
@@ -161,7 +152,6 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
 	for (i = 0; i < RCU_CBLIST_NSEGS; i++)
 		rsclp->tails[i] = &rsclp->head;
 	rcu_segcblist_set_len(rsclp, 0);
-	rsclp->len_lazy = 0;
 	rsclp->enabled = 1;
 }
 
@@ -173,7 +163,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
 {
 	WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
 	WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
-	WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
 	rsclp->enabled = 0;
 }
 
@@ -253,11 +242,9 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
  * absolutely not OK for it to ever miss posting a callback.
  */
 void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
-			   struct rcu_head *rhp, bool lazy)
+			   struct rcu_head *rhp)
 {
 	rcu_segcblist_inc_len(rsclp);
-	if (lazy)
-		rsclp->len_lazy++;
 	smp_mb(); /* Ensure counts are updated before callback is enqueued. */
 	rhp->next = NULL;
 	WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
@@ -275,15 +262,13 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
  * period.  You have been warned.
  */
 bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
-			   struct rcu_head *rhp, bool lazy)
+			   struct rcu_head *rhp)
 {
 	int i;
 
 	if (rcu_segcblist_n_cbs(rsclp) == 0)
 		return false;
 	rcu_segcblist_inc_len(rsclp);
-	if (lazy)
-		rsclp->len_lazy++;
 	smp_mb(); /* Ensure counts are updated before callback is entrained. */
 	rhp->next = NULL;
 	for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
@@ -307,8 +292,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
 void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
 					       struct rcu_cblist *rclp)
 {
-	rclp->len_lazy += rsclp->len_lazy;
-	rsclp->len_lazy = 0;
 	rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
 }
 
@@ -361,9 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
 void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
 				struct rcu_cblist *rclp)
 {
-	rsclp->len_lazy += rclp->len_lazy;
 	rcu_segcblist_add_len(rsclp, rclp->len);
-	rclp->len_lazy = 0;
 	rclp->len = 0;
 }
 
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 815c2fdd3fcc..5c293afc07b8 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,15 +15,6 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
 	return READ_ONCE(rclp->len);
 }
 
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
-{
-	rclp->len_lazy--;
-}
-
 void rcu_cblist_init(struct rcu_cblist *rclp);
 void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
 void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
@@ -59,18 +50,6 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
 #endif
 }
 
-/* Return number of lazy callbacks in segmented callback list. */
-static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
-{
-	return rsclp->len_lazy;
-}
-
-/* Return number of lazy callbacks in segmented callback list. */
-static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
-{
-	return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy;
-}
-
 /*
  * Is the specified rcu_segcblist enabled, for example, not corresponding
  * to an offline CPU?
@@ -106,9 +85,9 @@ struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
 struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
 bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
 void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
-			   struct rcu_head *rhp, bool lazy);
+			   struct rcu_head *rhp);
 bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
-			   struct rcu_head *rhp, bool lazy);
+			   struct rcu_head *rhp);
 void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
 				 struct rcu_cblist *rclp);
 void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 5dffade2d7cd..d0a9d5b69087 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -853,7 +853,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 	local_irq_save(flags);
 	sdp = this_cpu_ptr(ssp->sda);
 	spin_lock_rcu_node(sdp);
-	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
+	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_gp_seq));
 	s = rcu_seq_snap(&ssp->srcu_gp_seq);
@@ -1052,7 +1052,7 @@ void srcu_barrier(struct srcu_struct *ssp)
 		sdp->srcu_barrier_head.func = srcu_barrier_cb;
 		debug_rcu_head_queue(&sdp->srcu_barrier_head);
 		if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
-					   &sdp->srcu_barrier_head, 0)) {
+					   &sdp->srcu_barrier_head)) {
 			debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
 			atomic_dec(&ssp->srcu_barrier_cpu_cnt);
 		}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 477b4eb44af5..dd572ce7c747 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -22,6 +22,7 @@
 #include <linux/time.h>
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
+#include <linux/slab.h>
 
 #include "rcu.h"
 
@@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user)
 	}
 }
 
+/*
+ * Reclaim the specified callback, either by invoking it for non-kfree cases or
+ * freeing it directly (for kfree). Return true if kfreeing, false otherwise.
+ */
+static inline bool rcu_reclaim_tiny(struct rcu_head *head)
+{
+	rcu_callback_t f;
+	unsigned long offset = (unsigned long)head->func;
+
+	rcu_lock_acquire(&rcu_callback_map);
+	if (__is_kfree_rcu_offset(offset)) {
+		trace_rcu_invoke_kfree_callback("", head, offset);
+		kfree((void *)head - offset);
+		rcu_lock_release(&rcu_callback_map);
+		return true;
+	}
+
+	trace_rcu_invoke_callback("", head);
+	f = head->func;
+	WRITE_ONCE(head->func, (rcu_callback_t)0L);
+	f(head);
+	rcu_lock_release(&rcu_callback_map);
+	return false;
+}
+
 /* Invoke the RCU callbacks whose grace period has elapsed.  */
 static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
 {
@@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
 		prefetch(next);
 		debug_rcu_head_unqueue(list);
 		local_bh_disable();
-		__rcu_reclaim("", list);
+		rcu_reclaim_tiny(list);
 		local_bh_enable();
 		list = next;
 	}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0512221cd84b..a8dd612098bf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -55,6 +55,7 @@
 #include <linux/oom.h>
 #include <linux/smpboot.h>
 #include <linux/jiffies.h>
+#include <linux/slab.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/clock.h>
 #include "../time/tick-internal.h"
@@ -2146,7 +2147,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	/* If no callbacks are ready, just return. */
 	if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
 		trace_rcu_batch_start(rcu_state.name,
-				      rcu_segcblist_n_lazy_cbs(&rdp->cblist),
 				      rcu_segcblist_n_cbs(&rdp->cblist), 0);
 		trace_rcu_batch_end(rcu_state.name, 0,
 				    !rcu_segcblist_empty(&rdp->cblist),
@@ -2168,7 +2168,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	if (unlikely(bl > 100))
 		tlimit = local_clock() + rcu_resched_ns;
 	trace_rcu_batch_start(rcu_state.name,
-			      rcu_segcblist_n_lazy_cbs(&rdp->cblist),
 			      rcu_segcblist_n_cbs(&rdp->cblist), bl);
 	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
 	if (offloaded)
@@ -2179,9 +2178,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	tick_dep_set_task(current, TICK_DEP_BIT_RCU);
 	rhp = rcu_cblist_dequeue(&rcl);
 	for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+		rcu_callback_t f;
+
 		debug_rcu_head_unqueue(rhp);
-		if (__rcu_reclaim(rcu_state.name, rhp))
-			rcu_cblist_dequeued_lazy(&rcl);
+
+		rcu_lock_acquire(&rcu_callback_map);
+		trace_rcu_invoke_callback(rcu_state.name, rhp);
+
+		f = rhp->func;
+		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
+		f(rhp);
+
+		rcu_lock_release(&rcu_callback_map);
+
 		/*
 		 * Stop only if limit reached and CPU has something to do.
 		 * Note: The rcl structure counts down from zero.
@@ -2583,7 +2592,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
  * is expected to specify a CPU.
  */
 static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
+__call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct rcu_data *rdp;
@@ -2618,18 +2627,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
 		if (rcu_segcblist_empty(&rdp->cblist))
 			rcu_segcblist_init(&rdp->cblist);
 	}
+
 	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	/* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
-	rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
+	rcu_segcblist_enqueue(&rdp->cblist, head);
 	if (__is_kfree_rcu_offset((unsigned long)func))
 		trace_rcu_kfree_callback(rcu_state.name, head,
 					 (unsigned long)func,
-					 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
 					 rcu_segcblist_n_cbs(&rdp->cblist));
 	else
 		trace_rcu_callback(rcu_state.name, head,
-				   rcu_segcblist_n_lazy_cbs(&rdp->cblist),
 				   rcu_segcblist_n_cbs(&rdp->cblist));
 
 	/* Go handle any RCU core processing required. */
@@ -2679,7 +2687,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
  */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	__call_rcu(head, func, 0);
+	__call_rcu(head, func);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
@@ -2747,10 +2755,18 @@ static void kfree_rcu_work(struct work_struct *work)
 
 	// List "head" is now private, so traverse locklessly.
 	for (; head; head = next) {
+		unsigned long offset = (unsigned long)head->func;
+
 		next = head->next;
 		// Potentially optimize with kfree_bulk in future.
 		debug_rcu_head_unqueue(head);
-		__rcu_reclaim(rcu_state.name, head);
+		rcu_lock_acquire(&rcu_callback_map);
+		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+
+		/* Could be possible to optimize with kfree_bulk in future */
+		kfree((void *)head - offset);
+
+		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
 	}
 }
@@ -2825,7 +2841,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
  */
 void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
 {
-	__call_rcu(head, func, 1);
+	__call_rcu(head, func);
 }
 EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
 
@@ -3100,7 +3116,7 @@ static void rcu_barrier_func(void *unused)
 	debug_rcu_head_queue(&rdp->barrier_head);
 	rcu_nocb_lock(rdp);
 	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
-	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
+	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
 		atomic_inc(&rcu_state.barrier_cpu_count);
 	} else {
 		debug_rcu_head_unqueue(&rdp->barrier_head);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 055c31781d3a..15405420b40c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -183,7 +183,6 @@ struct rcu_data {
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
 	bool rcu_forced_tick;		/* Forced tick to provide QS. */
 #ifdef CONFIG_RCU_FAST_NO_HZ
-	bool all_lazy;			/* All CPU's CBs lazy at idle start? */
 	unsigned long last_accelerate;	/* Last jiffy CBs were accelerated. */
 	unsigned long last_advance_all;	/* Last jiffy CBs were all advanced. */
 	int tick_nohz_enabled_snap;	/* Previously seen value from sysfs. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fa08d55f7040..d5334e49ccca 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1262,10 +1262,9 @@ static void rcu_prepare_for_idle(void)
 /*
  * This code is invoked when a CPU goes idle, at which point we want
  * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode.  This is handled by a
- * state machine implemented by rcu_prepare_for_idle() below.
+ * the energy-efficient dyntick-idle mode.
  *
- * The following three proprocessor symbols control this state machine:
+ * The following preprocessor symbol controls this:
  *
  * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
  *	to sleep in dyntick-idle mode with RCU callbacks pending.  This
@@ -1274,21 +1273,15 @@ static void rcu_prepare_for_idle(void)
  *	number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
  *	system.  And if you are -that- concerned about energy efficiency,
  *	just power the system down and be done with it!
- * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
- *	permitted to sleep in dyntick-idle mode with only lazy RCU
- *	callbacks pending.  Setting this too high can OOM your system.
  *
- * The values below work well in practice.  If future workloads require
+ * The value below works well in practice.  If future workloads require
  * adjustment, they can be converted into kernel config parameters, though
  * making the state machine smarter might be a better option.
  */
 #define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */
-#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */
 
 static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
 module_param(rcu_idle_gp_delay, int, 0644);
-static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
-module_param(rcu_idle_lazy_gp_delay, int, 0644);
 
 /*
  * Try to advance callbacks on the current CPU, but only if it has been
@@ -1327,8 +1320,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
 /*
  * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
  * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
- * caller to set the timeout based on whether or not there are non-lazy
- * callbacks.
+ * caller about what to set the timeout.
  *
  * The caller must have disabled interrupts.
  */
@@ -1354,25 +1346,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 	}
 	rdp->last_accelerate = jiffies;
 
-	/* Request timer delay depending on laziness, and round. */
-	rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist);
-	if (rdp->all_lazy) {
-		dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
-	} else {
-		dj = round_up(rcu_idle_gp_delay + jiffies,
-			       rcu_idle_gp_delay) - jiffies;
-	}
+	/* Request timer and round. */
+	dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
+
 	*nextevt = basemono + dj * TICK_NSEC;
 	return 0;
 }
 
 /*
- * Prepare a CPU for idle from an RCU perspective.  The first major task
- * is to sense whether nohz mode has been enabled or disabled via sysfs.
- * The second major task is to check to see if a non-lazy callback has
- * arrived at a CPU that previously had only lazy callbacks.  The third
- * major task is to accelerate (that is, assign grace-period numbers to)
- * any recently arrived callbacks.
+ * Prepare a CPU for idle from an RCU perspective.  The first major task is to
+ * sense whether nohz mode has been enabled or disabled via sysfs.  The second
+ * major task is to accelerate (that is, assign grace-period numbers to) any
+ * recently arrived callbacks.
  *
  * The caller must have disabled interrupts.
  */
@@ -1398,17 +1383,6 @@ static void rcu_prepare_for_idle(void)
 	if (!tne)
 		return;
 
-	/*
-	 * If a non-lazy callback arrived at a CPU having only lazy
-	 * callbacks, invoke RCU core for the side-effect of recalculating
-	 * idle duration on re-entry to idle.
-	 */
-	if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {
-		rdp->all_lazy = false;
-		invoke_rcu_core();
-		return;
-	}
-
 	/*
 	 * If we have not yet accelerated this jiffy, accelerate all
 	 * callbacks on this CPU.
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index c0b8c458d8a6..806f2ddc8f74 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 
-	sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
+	sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
 		rdp->last_accelerate & 0xffff, jiffies & 0xffff,
-		".l"[rdp->all_lazy],
-		".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
-		".D"[!!rdp->tick_nohz_enabled_snap]);
+		!!rdp->tick_nohz_enabled_snap);
 }
 
 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */

From 189a6883dcf7fa70e17403ae4225c60ffc9e404b Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Fri, 30 Aug 2019 12:36:33 -0400
Subject: [PATCH 53/69] rcu: Remove kfree_call_rcu_nobatch()

Now that the kfree_rcu() special-casing has been removed from tree RCU,
this commit removes kfree_call_rcu_nobatch() since it is no longer needed.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../admin-guide/kernel-parameters.txt          |  4 ----
 include/linux/rcutiny.h                        |  5 -----
 include/linux/rcutree.h                        |  1 -
 kernel/rcu/rcuperf.c                           | 10 +---------
 kernel/rcu/tree.c                              | 18 ++++--------------
 5 files changed, 5 insertions(+), 33 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3ce270b56f3a..ed83d6d90cc3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3991,10 +3991,6 @@
 			Number of loops doing rcuperf.kfree_alloc_num number
 			of allocations and frees.
 
-	rcuperf.kfree_no_batch= [KNL]
-			Use the non-batching (less efficient) version of kfree_rcu().
-			This is useful for comparing with the batched version.
-
 	rcuperf.nreaders= [KNL]
 			Set number of RCU readers.  The value -1 selects
 			N, where N is the number of CPUs.  A value
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 1bd166aab6f3..b2b2dc990da9 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -39,11 +39,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	call_rcu(head, func);
 }
 
-static inline void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
-{
-	call_rcu(head, func);
-}
-
 void rcu_qs(void);
 
 static inline void rcu_softirq_qs(void)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 6a65d3a16dbd..2f787b9029d1 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,7 +34,6 @@ static inline void rcu_virt_note_context_switch(int cpu)
 
 void synchronize_rcu_expedited(void);
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
-void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
 
 void rcu_barrier(void);
 bool rcu_eqs_special_set(int cpu);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index c1e25fd10f2a..da94b89cd531 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -593,7 +593,6 @@ rcu_perf_shutdown(void *arg)
 torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
 torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
 torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
-torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu().");
 
 static struct task_struct **kfree_reader_tasks;
 static int kfree_nrealthreads;
@@ -632,14 +631,7 @@ kfree_perf_thread(void *arg)
 			if (!alloc_ptr)
 				return -ENOMEM;
 
-			if (!kfree_no_batch) {
-				kfree_rcu(alloc_ptr, rh);
-			} else {
-				rcu_callback_t cb;
-
-				cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
-				kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb);
-			}
+			kfree_rcu(alloc_ptr, rh);
 		}
 
 		cond_resched();
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a8dd612098bf..31d2d9255d95 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2763,8 +2763,10 @@ static void kfree_rcu_work(struct work_struct *work)
 		rcu_lock_acquire(&rcu_callback_map);
 		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
-		/* Could be possible to optimize with kfree_bulk in future */
-		kfree((void *)head - offset);
+		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
+			/* Could be optimized with kfree_bulk() in future. */
+			kfree((void *)head - offset);
+		}
 
 		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
@@ -2835,16 +2837,6 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
-/*
- * This version of kfree_call_rcu does not do batching of kfree_rcu() requests.
- * Used only by rcuperf torture test for comparison with kfree_rcu_batch().
- */
-void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
-{
-	__call_rcu(head, func);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
-
 /*
  * Queue a request for lazy invocation of kfree() after a grace period.
  *
@@ -2864,8 +2856,6 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
 
-	head->func = func;
-
 	local_irq_save(flags);	// For safely calling this_cpu_ptr().
 	krcp = this_cpu_ptr(&krc);
 	if (krcp->initialized)

From c130d2dc93cd03323494d82dbe7b5fb0d101ab62 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshanlai@gmail.com>
Date: Tue, 15 Oct 2019 10:28:48 +0000
Subject: [PATCH 54/69] rcu: Rename some instance of CONFIG_PREEMPTION to
 CONFIG_PREEMPT_RCU

CONFIG_PREEMPTION and CONFIG_PREEMPT_RCU are always identical,
but some code depends on CONFIG_PREEMPTION to access to
rcu_preempt functionality. This patch changes CONFIG_PREEMPTION
to CONFIG_PREEMPT_RCU in these cases.

Signed-off-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c       | 4 ++--
 kernel/rcu/tree_stall.h | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c9dbb05e4c13..5445da2326a0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1934,7 +1934,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
 	struct rcu_node *rnp_p;
 
 	raw_lockdep_assert_held_rcu_node(rnp);
-	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
+	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
 	    WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
 	    rnp->qsmask != 0) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2294,7 +2294,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 		mask = 0;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if (rnp->qsmask == 0) {
-			if (!IS_ENABLED(CONFIG_PREEMPTION) ||
+			if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
 			    rcu_preempt_blocked_readers_cgp(rnp)) {
 				/*
 				 * No point in scanning bits because they
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index c0b8c458d8a6..a6652efedd48 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)
 //
 // Printing RCU CPU stall warnings
 
-#ifdef CONFIG_PREEMPTION
+#ifdef CONFIG_PREEMPT_RCU
 
 /*
  * Dump detailed information for all tasks blocking the current RCU
@@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
 	return ndetected;
 }
 
-#else /* #ifdef CONFIG_PREEMPTION */
+#else /* #ifdef CONFIG_PREEMPT_RCU */
 
 /*
  * Because preemptible RCU does not exist, we never have to check for
@@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
 {
 	return 0;
 }
-#endif /* #else #ifdef CONFIG_PREEMPTION */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /*
  * Dump stacks of all tasks running on stalled CPUs.  First try using

From 2eeba5838fd8c5e19bb91e25624116936348e7af Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Fri, 1 Nov 2019 04:06:22 -0700
Subject: [PATCH 55/69] rcu: Clear .exp_hint only when deferred quiescent state
 has been reported

Currently, the .exp_hint flag is cleared in rcu_read_unlock_special(),
which works, but which can also prevent subsequent rcu_read_unlock() calls
from helping expedite the quiescent state needed by an ongoing expedited
RCU grace period.  This commit therefore defers clearing of .exp_hint
from rcu_read_unlock_special() to rcu_preempt_deferred_qs_irqrestore(),
thus ensuring that intervening calls to rcu_read_unlock() have a chance
to help end the expedited grace period.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8cdce111ea73..7487c7930a47 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -444,6 +444,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		local_irq_restore(flags);
 		return;
 	}
+	t->rcu_read_unlock_special.b.exp_hint = false;
 	t->rcu_read_unlock_special.b.deferred_qs = false;
 	if (special.b.need_qs) {
 		rcu_qs();
@@ -610,7 +611,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 		struct rcu_node *rnp = rdp->mynode;
 
-		t->rcu_read_unlock_special.b.exp_hint = false;
 		exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
 		      (rdp->grpmask & rnp->expmask) ||
 		      tick_nohz_full_cpu(rdp->cpu);
@@ -640,7 +640,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		local_irq_restore(flags);
 		return;
 	}
-	WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
 	rcu_preempt_deferred_qs_irqrestore(t, flags);
 }
 

From 3717e1e9f25ec7059e421ab6fc602cab7063c11c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Fri, 1 Nov 2019 05:06:21 -0700
Subject: [PATCH 56/69] rcu: Clear ->rcu_read_unlock_special only once

In rcu_preempt_deferred_qs_irqrestore(), ->rcu_read_unlock_special is
cleared one piece at a time.  Given that the "if" statements in this
function use the copy in "special", this commit removes the clearing
of the individual pieces in favor of clearing ->rcu_read_unlock_special
in one go just after it has been determined to be non-zero.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7487c7930a47..c3a32717c42d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -444,16 +444,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		local_irq_restore(flags);
 		return;
 	}
-	t->rcu_read_unlock_special.b.exp_hint = false;
-	t->rcu_read_unlock_special.b.deferred_qs = false;
-	if (special.b.need_qs) {
+	t->rcu_read_unlock_special.s = 0;
+	if (special.b.need_qs)
 		rcu_qs();
-		t->rcu_read_unlock_special.b.need_qs = false;
-		if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) {
-			local_irq_restore(flags);
-			return;
-		}
-	}
 
 	/*
 	 * Respond to a request by an expedited grace period for a
@@ -461,17 +454,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	 * tasks are handled when removing the task from the
 	 * blocked-tasks list below.
 	 */
-	if (rdp->exp_deferred_qs) {
+	if (rdp->exp_deferred_qs)
 		rcu_report_exp_rdp(rdp);
-		if (!t->rcu_read_unlock_special.s) {
-			local_irq_restore(flags);
-			return;
-		}
-	}
 
 	/* Clean up if blocked during RCU read-side critical section. */
 	if (special.b.blocked) {
-		t->rcu_read_unlock_special.b.blocked = false;
 
 		/*
 		 * Remove this task from the list it blocked on.  The task

From c51f83c315c392d9776c33eb16a2fe1349d65c7f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 4 Nov 2019 08:22:45 -0800
Subject: [PATCH 57/69] rcu: Use READ_ONCE() for ->expmask in
 rcu_read_unlock_special()

The rcu_node structure's ->expmask field is updated only when holding the
->lock, but is also accessed locklessly.  This means that all ->expmask
updates must use WRITE_ONCE() and all reads carried out without holding
->lock must use READ_ONCE().  This commit therefore changes the lockless
->expmask read in rcu_read_unlock_special() to use READ_ONCE().

Reported-by: syzbot+99f4ddade3c22ab0cf23@syzkaller.appspotmail.com
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Marco Elver <elver@google.com>
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c3a32717c42d..698a7f195f88 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -599,7 +599,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		struct rcu_node *rnp = rdp->mynode;
 
 		exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
-		      (rdp->grpmask & rnp->expmask) ||
+		      (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
 		      tick_nohz_full_cpu(rdp->cpu);
 		// Need to defer quiescent state until everything is enabled.
 		if (irqs_were_disabled && use_softirq &&

From 77339e61aa309310a535bd01eb3388f7a27b36f9 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Fri, 15 Nov 2019 14:08:53 -0800
Subject: [PATCH 58/69] rcu: Provide wrappers for uses of
 ->rcu_read_lock_nesting

This commit provides wrapper functions for uses of ->rcu_read_lock_nesting
to improve readability and to ease future changes to support inlining
of __rcu_read_lock() and __rcu_read_unlock().

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h    |  4 +--
 kernel/rcu/tree_plugin.h | 53 ++++++++++++++++++++++++++--------------
 2 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 98d078cafa5a..d8da6b1a3209 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -610,7 +610,7 @@ static void rcu_exp_handler(void *unused)
 	 * critical section.  If also enabled or idle, immediately
 	 * report the quiescent state, otherwise defer.
 	 */
-	if (!t->rcu_read_lock_nesting) {
+	if (!rcu_preempt_depth()) {
 		if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
 		    rcu_dynticks_curr_cpu_in_eqs()) {
 			rcu_report_exp_rdp(rdp);
@@ -634,7 +634,7 @@ static void rcu_exp_handler(void *unused)
 	 * can have caused this quiescent state to already have been
 	 * reported, so we really do need to check ->expmask.
 	 */
-	if (t->rcu_read_lock_nesting > 0) {
+	if (rcu_preempt_depth() > 0) {
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if (rnp->expmask & rdp->grpmask) {
 			rdp->exp_deferred_qs = true;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 698a7f195f88..ebdbdec5911f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -290,8 +290,8 @@ void rcu_note_context_switch(bool preempt)
 
 	trace_rcu_utilization(TPS("Start context switch"));
 	lockdep_assert_irqs_disabled();
-	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
-	if (t->rcu_read_lock_nesting > 0 &&
+	WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
+	if (rcu_preempt_depth() > 0 &&
 	    !t->rcu_read_unlock_special.b.blocked) {
 
 		/* Possibly blocking in an RCU read-side critical section. */
@@ -348,6 +348,21 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 #define RCU_NEST_NMAX (-INT_MAX / 2)
 #define RCU_NEST_PMAX (INT_MAX / 2)
 
+static void rcu_preempt_read_enter(void)
+{
+	current->rcu_read_lock_nesting++;
+}
+
+static void rcu_preempt_read_exit(void)
+{
+	current->rcu_read_lock_nesting--;
+}
+
+static void rcu_preempt_depth_set(int val)
+{
+	current->rcu_read_lock_nesting = val;
+}
+
 /*
  * Preemptible RCU implementation for rcu_read_lock().
  * Just increment ->rcu_read_lock_nesting, shared state will be updated
@@ -355,9 +370,9 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
  */
 void __rcu_read_lock(void)
 {
-	current->rcu_read_lock_nesting++;
+	rcu_preempt_read_enter();
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
-		WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX);
+		WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
 	barrier();  /* critical section after entry code. */
 }
 EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -373,19 +388,19 @@ void __rcu_read_unlock(void)
 {
 	struct task_struct *t = current;
 
-	if (t->rcu_read_lock_nesting != 1) {
-		--t->rcu_read_lock_nesting;
+	if (rcu_preempt_depth() != 1) {
+		rcu_preempt_read_exit();
 	} else {
 		barrier();  /* critical section before exit code. */
-		t->rcu_read_lock_nesting = -RCU_NEST_BIAS;
+		rcu_preempt_depth_set(-RCU_NEST_BIAS);
 		barrier();  /* assign before ->rcu_read_unlock_special load */
 		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
 			rcu_read_unlock_special(t);
 		barrier();  /* ->rcu_read_unlock_special load before assign */
-		t->rcu_read_lock_nesting = 0;
+		rcu_preempt_depth_set(0);
 	}
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
-		int rrln = t->rcu_read_lock_nesting;
+		int rrln = rcu_preempt_depth();
 
 		WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX);
 	}
@@ -539,7 +554,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 {
 	return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
 		READ_ONCE(t->rcu_read_unlock_special.s)) &&
-	       t->rcu_read_lock_nesting <= 0;
+	       rcu_preempt_depth() <= 0;
 }
 
 /*
@@ -552,16 +567,16 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 static void rcu_preempt_deferred_qs(struct task_struct *t)
 {
 	unsigned long flags;
-	bool couldrecurse = t->rcu_read_lock_nesting >= 0;
+	bool couldrecurse = rcu_preempt_depth() >= 0;
 
 	if (!rcu_preempt_need_deferred_qs(t))
 		return;
 	if (couldrecurse)
-		t->rcu_read_lock_nesting -= RCU_NEST_BIAS;
+		rcu_preempt_depth_set(rcu_preempt_depth() - RCU_NEST_BIAS);
 	local_irq_save(flags);
 	rcu_preempt_deferred_qs_irqrestore(t, flags);
 	if (couldrecurse)
-		t->rcu_read_lock_nesting += RCU_NEST_BIAS;
+		rcu_preempt_depth_set(rcu_preempt_depth() + RCU_NEST_BIAS);
 }
 
 /*
@@ -672,7 +687,7 @@ static void rcu_flavor_sched_clock_irq(int user)
 	if (user || rcu_is_cpu_rrupt_from_idle()) {
 		rcu_note_voluntary_context_switch(current);
 	}
-	if (t->rcu_read_lock_nesting > 0 ||
+	if (rcu_preempt_depth() > 0 ||
 	    (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
 		/* No QS, force context switch if deferred. */
 		if (rcu_preempt_need_deferred_qs(t)) {
@@ -682,13 +697,13 @@ static void rcu_flavor_sched_clock_irq(int user)
 	} else if (rcu_preempt_need_deferred_qs(t)) {
 		rcu_preempt_deferred_qs(t); /* Report deferred QS. */
 		return;
-	} else if (!t->rcu_read_lock_nesting) {
+	} else if (!rcu_preempt_depth()) {
 		rcu_qs(); /* Report immediate QS. */
 		return;
 	}
 
 	/* If GP is oldish, ask for help from rcu_read_unlock_special(). */
-	if (t->rcu_read_lock_nesting > 0 &&
+	if (rcu_preempt_depth() > 0 &&
 	    __this_cpu_read(rcu_data.core_needs_qs) &&
 	    __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
 	    !t->rcu_read_unlock_special.b.need_qs &&
@@ -709,11 +724,11 @@ void exit_rcu(void)
 	struct task_struct *t = current;
 
 	if (unlikely(!list_empty(&current->rcu_node_entry))) {
-		t->rcu_read_lock_nesting = 1;
+		rcu_preempt_depth_set(1);
 		barrier();
 		WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
-	} else if (unlikely(t->rcu_read_lock_nesting)) {
-		t->rcu_read_lock_nesting = 1;
+	} else if (unlikely(rcu_preempt_depth())) {
+		rcu_preempt_depth_set(1);
 	} else {
 		return;
 	}

From 5b14557b073c96a7cf79adc4d7b6c4a8c26b2a43 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 Nov 2019 18:05:45 -0800
Subject: [PATCH 59/69] rcu: Avoid tick_dep_set_cpu() misordering

In the current code, rcu_nmi_enter_common() might decide to turn on
the tick using tick_dep_set_cpu(), but be delayed just before doing so.
Then the grace-period kthread might notice that the CPU in question had
in fact gone through a quiescent state, thus turning off the tick using
tick_dep_clear_cpu().  The later invocation of tick_dep_set_cpu() would
then incorrectly leave the tick on.

This commit therefore enlists the aid of the leaf rcu_node structure's
->lock to ensure that decisions to enable or disable the tick are
carried out before they can be reversed.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5445da2326a0..b0e0612392a9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -800,8 +800,8 @@ void rcu_user_exit(void)
  */
 static __always_inline void rcu_nmi_enter_common(bool irq)
 {
-	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	long incby = 2;
+	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
 	/* Complain about underflow. */
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
@@ -828,8 +828,13 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
 	} else if (tick_nohz_full_cpu(rdp->cpu) &&
 		   rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
 		   READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
-		rdp->rcu_forced_tick = true;
-		tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+		raw_spin_lock_rcu_node(rdp->mynode);
+		// Recheck under lock.
+		if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+			rdp->rcu_forced_tick = true;
+			tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+		}
+		raw_spin_unlock_rcu_node(rdp->mynode);
 	}
 	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
 			  rdp->dynticks_nmi_nesting,
@@ -898,6 +903,7 @@ void rcu_irq_enter_irqson(void)
  */
 static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
 {
+	raw_lockdep_assert_held_rcu_node(rdp->mynode);
 	WRITE_ONCE(rdp->rcu_urgent_qs, false);
 	WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
 	if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {

From 822175e72995a9ff7eeef4f5cd3f945f2697b67d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshanlai@gmail.com>
Date: Tue, 15 Oct 2019 10:23:56 +0000
Subject: [PATCH 60/69] rcu: Fix harmless omission of "CONFIG_" from #if
 condition

The C preprocessor macros SRCU and TINY_RCU should instead be CONFIG_SRCU
and CONFIG_TINY_RCU, respectively in the #f in kernel/rcu/rcu.h. But
there is no harm when "TINY_RCU" is wrongly used, which are always
non-defined, which makes "!defined(TINY_RCU)" always true, which means
the code block is always included, and the included code block doesn't
cause any compilation error so far in CONFIG_TINY_RCU builds.  It is
also the reason this change should not be taken in -stable.

This commit adds the needed "CONFIG_" prefix to both macros.

Not for -stable.

Signed-off-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ab504fbc76ca..473259422547 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -281,7 +281,7 @@ void rcu_test_sync_prims(void);
  */
 extern void resched_cpu(int cpu);
 
-#if defined(SRCU) || !defined(TINY_RCU)
+#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
 
 #include <linux/rcu_node_tree.h>
 
@@ -418,7 +418,7 @@ do {									\
 #define raw_lockdep_assert_held_rcu_node(p)				\
 	lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
 
-#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
+#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
 
 #ifdef CONFIG_SRCU
 void srcu_init(void);

From 2488a5e695564897a0f8ea42cd3af71647cd26d2 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshanlai@gmail.com>
Date: Tue, 15 Oct 2019 10:23:57 +0000
Subject: [PATCH 61/69] rcu: Fix tracepoint tracking RCU CPU kthread
 utilization

In the call to trace_rcu_utilization() at the start of the loop in
rcu_cpu_kthread(), "rcu_wait" is incorrect, plus this trace event needs
to be hoisted above the loop to balance with either the "rcu_wait" or
"rcu_yield", depending on how the loop exits.  This commit therefore
makes these changes.

Signed-off-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index dd8cfc34f4da..ba154a3080fb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2474,8 +2474,8 @@ static void rcu_cpu_kthread(unsigned int cpu)
 	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
 	int spincnt;
 
+	trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
 	for (spincnt = 0; spincnt < 10; spincnt++) {
-		trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
 		local_bh_disable();
 		*statusp = RCU_KTHREAD_RUNNING;
 		local_irq_disable();

From 4778339df0ee361dbf2cbdcb87abaec9dfbc841d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshanlai@gmail.com>
Date: Tue, 15 Oct 2019 10:28:46 +0000
Subject: [PATCH 62/69] rcu: Remove the declaration of call_rcu() in tree.h

The call_rcu() function is an external RCU API that is declared in
include/linux/rcupdate.h.  There is thus no point in redeclaring it
in kernel/rcu/tree.h, so this commit removes that redundant declaration.

Signed-off-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e4dc5debfc84..54ff9896ae31 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -413,7 +413,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 static int rcu_print_task_exp_stall(struct rcu_node *rnp);
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 static void rcu_flavor_sched_clock_irq(int user);
-void call_rcu(struct rcu_head *head, rcu_callback_t func);
 static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);

From e2167b38c87a0c9e85c342a823dae1e6f67b11d9 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshanlai@gmail.com>
Date: Tue, 15 Oct 2019 10:28:47 +0000
Subject: [PATCH 63/69] rcu: Move gp_state_names[] and gp_state_getname() to
 tree_stall.h

Only tree_stall.h needs to get name from GP state, so this commit
moves the gp_state_names[] array and the gp_state_getname()
from kernel/rcu/tree.h and kernel/rcu/tree.c, respectively, to
kernel/rcu/tree_stall.h.  While moving gp_state_names[], this commit
uses the GCC syntax to ensure that the right string is associated with
the right CPP macro.

Signed-off-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c       | 10 ----------
 kernel/rcu/tree.h       | 12 ------------
 kernel/rcu/tree_stall.h | 22 ++++++++++++++++++++++
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ba154a3080fb..bbb60ed310b1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -528,16 +528,6 @@ static struct rcu_node *rcu_get_root(void)
 	return &rcu_state.node[0];
 }
 
-/*
- * Convert a ->gp_state value to a character string.
- */
-static const char *gp_state_getname(short gs)
-{
-	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
-		return "???";
-	return gp_state_names[gs];
-}
-
 /*
  * Send along grace-period-related data for rcutorture diagnostics.
  */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 54ff9896ae31..9d5986abfc67 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -368,18 +368,6 @@ struct rcu_state {
 #define RCU_GP_CLEANUP   7	/* Grace-period cleanup started. */
 #define RCU_GP_CLEANED   8	/* Grace-period cleanup complete. */
 
-static const char * const gp_state_names[] = {
-	"RCU_GP_IDLE",
-	"RCU_GP_WAIT_GPS",
-	"RCU_GP_DONE_GPS",
-	"RCU_GP_ONOFF",
-	"RCU_GP_INIT",
-	"RCU_GP_WAIT_FQS",
-	"RCU_GP_DOING_FQS",
-	"RCU_GP_CLEANUP",
-	"RCU_GP_CLEANED",
-};
-
 /*
  * In order to export the rcu_state name to the tracing tools, it
  * needs to be added in the __tracepoint_string section.
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index c0b8c458d8a6..f18adaf3bf39 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -279,6 +279,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 
 #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
 
+static const char * const gp_state_names[] = {
+	[RCU_GP_IDLE] = "RCU_GP_IDLE",
+	[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
+	[RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS",
+	[RCU_GP_ONOFF] = "RCU_GP_ONOFF",
+	[RCU_GP_INIT] = "RCU_GP_INIT",
+	[RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS",
+	[RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS",
+	[RCU_GP_CLEANUP] = "RCU_GP_CLEANUP",
+	[RCU_GP_CLEANED] = "RCU_GP_CLEANED",
+};
+
+/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
+		return "???";
+	return gp_state_names[gs];
+}
+
 /*
  * Print out diagnostic information for the specified stalled CPU.
  *

From e1350e8e0ea5d959c23c5e593ff3026a67dbb049 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben.dooks@codethink.co.uk>
Date: Tue, 15 Oct 2019 14:48:22 +0100
Subject: [PATCH 64/69] rcu: Move rcu_{expedited,normal} definitions into
 rcupdate.h

This commit moves the rcu_{expedited,normal} definitions from
kernel/rcu/update.c to include/linux/rcupdate.h to make sure they are
in sync, and also to avoid the following warning from sparse:

kernel/ksysfs.c:150:5: warning: symbol 'rcu_expedited' was not declared. Should it be static?
kernel/ksysfs.c:167:5: warning: symbol 'rcu_normal' was not declared. Should it be static?

Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 4 ++++
 kernel/rcu/update.c      | 2 --
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index fe470243acdd..bb36379606d0 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -896,4 +896,8 @@ rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
 	return false;
 }
 
+/* kernel/ksysfs.c definitions */
+extern int rcu_expedited;
+extern int rcu_normal;
+
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1861103662db..294d357abd0c 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,9 +51,7 @@
 #define MODULE_PARAM_PREFIX "rcupdate."
 
 #ifndef CONFIG_TINY_RCU
-extern int rcu_expedited; /* from sysctl */
 module_param(rcu_expedited, int, 0);
-extern int rcu_normal; /* from sysctl */
 module_param(rcu_normal, int, 0);
 static int rcu_normal_after_boot;
 module_param(rcu_normal_after_boot, int, 0);

From 7441e7661d6586ae36329b7956e4d713d81e9903 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 30 Oct 2019 09:37:11 -0700
Subject: [PATCH 65/69] rcu: Switch force_qs_rnp() to
 for_each_leaf_node_cpu_mask()

Currently, force_qs_rnp() uses a for_each_leaf_node_possible_cpu()
loop containing a check of the current CPU's bit in ->qsmask.
This works, but this commit saves three lines by instead using
for_each_leaf_node_cpu_mask(), which combines the functionality of
for_each_leaf_node_possible_cpu() and leaf_node_cpu_bit().  This commit
also replaces the use of the local variable "bit" with rdp->grpmask.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bbb60ed310b1..d95076498488 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2298,14 +2298,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 			continue;
 		}
-		for_each_leaf_node_possible_cpu(rnp, cpu) {
-			unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
-			if ((rnp->qsmask & bit) != 0) {
-				rdp = per_cpu_ptr(&rcu_data, cpu);
-				if (f(rdp)) {
-					mask |= bit;
-					rcu_disable_urgency_upon_qs(rdp);
-				}
+		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+			rdp = per_cpu_ptr(&rcu_data, cpu);
+			if (f(rdp)) {
+				mask |= rdp->grpmask;
+				rcu_disable_urgency_upon_qs(rdp);
 			}
 		}
 		if (mask != 0) {

From 844a378de3372c923909681706d62336d702531e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 4 Nov 2019 08:08:30 -0800
Subject: [PATCH 66/69] srcu: Apply *_ONCE() to ->srcu_last_gp_end

The ->srcu_last_gp_end field is accessed from any CPU at any time
by synchronize_srcu(), so non-initialization references need to use
READ_ONCE() and WRITE_ONCE().  This commit therefore makes that change.

Reported-by: syzbot+08f3e9d26e5541e1ecf2@syzkaller.appspotmail.com
Acked-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 5dffade2d7cd..21acdff3bd27 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -530,7 +530,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	idx = rcu_seq_state(ssp->srcu_gp_seq);
 	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
 	cbdelay = srcu_get_delay(ssp);
-	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+	WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
 	rcu_seq_end(&ssp->srcu_gp_seq);
 	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
 	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
@@ -762,6 +762,7 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
 	unsigned long flags;
 	struct srcu_data *sdp;
 	unsigned long t;
+	unsigned long tlast;
 
 	/* If the local srcu_data structure has callbacks, not idle.  */
 	local_irq_save(flags);
@@ -780,9 +781,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
 
 	/* First, see if enough time has passed since the last GP. */
 	t = ktime_get_mono_fast_ns();
+	tlast = READ_ONCE(ssp->srcu_last_gp_end);
 	if (exp_holdoff == 0 ||
-	    time_in_range_open(t, ssp->srcu_last_gp_end,
-			       ssp->srcu_last_gp_end + exp_holdoff))
+	    time_in_range_open(t, tlast, tlast + exp_holdoff))
 		return false; /* Too soon after last GP. */
 
 	/* Next, check for probable idleness. */

From 05d35961c20b8919bb94df6f626f306553758d59 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 Nov 2019 15:35:26 -0800
Subject: [PATCH 67/69] .mailmap: Add entries for old paulmck@kernel.org
 addresses

[ paulmck: Apply Florian Fainelli feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .mailmap | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.mailmap b/.mailmap
index c24773db04a7..39efbe974395 100644
--- a/.mailmap
+++ b/.mailmap
@@ -207,6 +207,10 @@ Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
 Patrick Mochel <mochel@digitalimplant.org>
 Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
 Paul Burton <paulburton@kernel.org> <paul.burton@mips.com>
+Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.ibm.com>
+Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.vnet.ibm.com>
+Paul E. McKenney <paulmck@kernel.org> <paul.mckenney@linaro.org>
+Paul E. McKenney <paulmck@kernel.org> <paulmck@us.ibm.com>
 Peter A Jonsson <pj@ludd.ltu.se>
 Peter Oruba <peter@oruba.de>
 Peter Oruba <peter.oruba@amd.com>

From c7e9c01f928a0e7e467ec3efe1b56bc566678bae Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 8 Nov 2019 17:01:18 +0000
Subject: [PATCH 68/69] powerpc: Remove comment about read_barrier_depends()

'read_barrier_depends()' doesn't exist anymore so stop talking about it.

Signed-off-by: Will Deacon <will@kernel.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 arch/powerpc/include/asm/barrier.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index fbe8df433019..123adcefd40f 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -18,8 +18,6 @@
  * mb() prevents loads and stores being reordered across this point.
  * rmb() prevents loads being reordered across this point.
  * wmb() prevents stores being reordered across this point.
- * read_barrier_depends() prevents data-dependent loads being reordered
- *	across this point (nop on PPC).
  *
  * *mb() variants without smp_ prefix must order all types of memory
  * operations with one another. sync is the only instruction sufficient

From f6105fc2a9c0ea5be6b9e5c19b2551af0b8b4eac Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 27 Nov 2019 11:36:07 -0800
Subject: [PATCH 69/69] rcu: Remove unused stop-machine #include

Long ago, RCU used the stop-machine mechanism to implement expedited
grace periods, but no longer does so.  This commit therefore removes
the no-longer-needed #includes of linux/stop_machine.h.

Link: https://lwn.net/Articles/805317/
Reported-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 1 -
 kernel/rcu/tree.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d95076498488..878f62f218e9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -43,7 +43,6 @@
 #include <uapi/linux/sched/types.h>
 #include <linux/prefetch.h>
 #include <linux/delay.h>
-#include <linux/stop_machine.h>
 #include <linux/random.h>
 #include <linux/trace_events.h>
 #include <linux/suspend.h>
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9d5986abfc67..ce90c68c184b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -16,7 +16,6 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 #include <linux/swait.h>
-#include <linux/stop_machine.h>
 #include <linux/rcu_node_tree.h>
 
 #include "rcu_segcblist.h"