Commit 52d7e48b authored by Paul E. McKenney's avatar Paul E. McKenney

rcu: Narrow early boot window of illegal synchronous grace periods

The current preemptible RCU implementation goes through three phases
during bootup.  In the first phase, there is only one CPU that is running
with preemption disabled, so that a no-op is a synchronous grace period.
In the second mid-boot phase, the scheduler is running, but RCU has
not yet gotten its kthreads spawned (and, for expedited grace periods,
workqueues are not yet running).  During this time, any attempt to do
a synchronous grace period will hang the system (or complain bitterly,
depending).  In the third and final phase, RCU is fully operational and
everything works normally.

This has been OK for some time, but there have recently been some
synchronous grace periods showing up during the second mid-boot phase.
This code worked "by accident" for a while, but started failing as soon
as expedited RCU grace periods switched over to workqueues in commit
8b355e3b ("rcu: Drive expedited grace periods from workqueue").
Note that the code was buggy even before this commit, as it was subject
to failure on real-time systems that forced all expedited grace periods
to run as normal grace periods (for example, using the rcu_normal ksysfs
parameter).  The callchain from the failure case is as follows:

early_amd_iommu_init()
|-> acpi_put_table(ivrs_base);
|-> acpi_tb_put_table(table_desc);
|-> acpi_tb_invalidate_table(table_desc);
|-> acpi_tb_release_table(...)
|-> acpi_os_unmap_memory
|-> acpi_os_unmap_iomem
|-> acpi_os_map_cleanup
|-> synchronize_rcu_expedited

The kernel showing this callchain was built with CONFIG_PREEMPT_RCU=y,
which caused the code to try using workqueues before they were
initialized, which did not go well.

This commit therefore reworks RCU to permit synchronous grace periods
to proceed during this mid-boot phase.  This commit is therefore a
fix to a regression introduced in v4.9, and is therefore being put
forward post-merge-window in v4.10.

This commit sets a flag from the existing rcu_scheduler_starting()
function which causes all synchronous grace periods to take the expedited
path.  The expedited path now checks this flag, using the requesting task
to drive the expedited grace period forward during the mid-boot phase.
Finally, this flag is updated by a core_initcall() function named
rcu_exp_runtime_mode(), which causes the runtime codepaths to be used.

Note that this arrangement assumes that tasks are not sent POSIX signals
(or anything similar) from the time that the first task is spawned
through core_initcall() time.

Fixes: 8b355e3b ("rcu: Drive expedited grace periods from workqueue")
Reported-by: default avatar"Zheng, Lv" <lv.zheng@intel.com>
Reported-by: default avatarBorislav Petkov <bp@alien8.de>
Signed-off-by: default avatarPaul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: default avatarStan Kain <stan.kain@gmail.com>
Tested-by: default avatarIvan <waffolz@hotmail.com>
Tested-by: default avatarEmanuel Castelo <emanuel.castelo@gmail.com>
Tested-by: default avatarBruno Pesavento <bpesavento@infinito.it>
Tested-by: default avatarBorislav Petkov <bp@suse.de>
Tested-by: default avatarFrederic Bezies <fredbezies@gmail.com>
Cc: <stable@vger.kernel.org> # 4.9.0-
parent f466ae66
...@@ -444,6 +444,10 @@ bool __rcu_is_watching(void); ...@@ -444,6 +444,10 @@ bool __rcu_is_watching(void);
#error "Unknown RCU implementation specified to kernel configuration" #error "Unknown RCU implementation specified to kernel configuration"
#endif #endif
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
/* /*
* init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
* initialization and destruction of rcu_head on the stack. rcu_head structures * initialization and destruction of rcu_head on the stack. rcu_head structures
......
...@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void); ...@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
#define TPS(x) tracepoint_string(x) #define TPS(x) tracepoint_string(x)
void rcu_early_boot_tests(void); void rcu_early_boot_tests(void);
void rcu_test_sync_prims(void);
/* /*
* This function really isn't for public consumption, but RCU is special in * This function really isn't for public consumption, but RCU is special in
......
...@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); ...@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
/* /*
* During boot, we forgive RCU lockdep issues. After this function is * During boot, we forgive RCU lockdep issues. After this function is
* invoked, we start taking RCU lockdep issues seriously. * invoked, we start taking RCU lockdep issues seriously. Note that unlike
* Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
* to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
* The reason for this is that Tiny RCU does not need kthreads, so does
* not have to care about the fact that the scheduler is half-initialized
* at a certain phase of the boot process.
*/ */
void __init rcu_scheduler_starting(void) void __init rcu_scheduler_starting(void)
{ {
WARN_ON(nr_context_switches() > 0); WARN_ON(nr_context_switches() > 0);
rcu_scheduler_active = 1; rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
} }
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
......
...@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ ...@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
int sysctl_panic_on_rcu_stall __read_mostly; int sysctl_panic_on_rcu_stall __read_mostly;
/* /*
* The rcu_scheduler_active variable transitions from zero to one just * The rcu_scheduler_active variable is initialized to the value
* before the first task is spawned. So when this variable is zero, RCU * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the
* can assume that there is but one task, allowing RCU to (for example) * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
* RCU can assume that there is but one task, allowing RCU to (for example)
* optimize synchronize_rcu() to a simple barrier(). When this variable * optimize synchronize_rcu() to a simple barrier(). When this variable
* is one, RCU must actually do all the hard work required to detect real * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
* grace periods. This variable is also used to suppress boot-time false * to detect real grace periods. This variable is also used to suppress
* positives from lockdep-RCU error checking. * boot-time false positives from lockdep-RCU error checking. Finally, it
* transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
* is fully initialized, including all of its kthreads having been spawned.
*/ */
int rcu_scheduler_active __read_mostly; int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active); EXPORT_SYMBOL_GPL(rcu_scheduler_active);
...@@ -3980,18 +3983,22 @@ static int __init rcu_spawn_gp_kthread(void) ...@@ -3980,18 +3983,22 @@ static int __init rcu_spawn_gp_kthread(void)
early_initcall(rcu_spawn_gp_kthread); early_initcall(rcu_spawn_gp_kthread);
/* /*
* This function is invoked towards the end of the scheduler's initialization * This function is invoked towards the end of the scheduler's
* process. Before this is called, the idle task might contain * initialization process. Before this is called, the idle task might
* RCU read-side critical sections (during which time, this idle * contain synchronous grace-period primitives (during which time, this idle
* task is booting the system). After this function is called, the * task is booting the system, and such primitives are no-ops). After this
* idle tasks are prohibited from containing RCU read-side critical * function is called, any synchronous grace-period primitives are run as
* sections. This function also enables RCU lockdep checking. * expedited, with the requesting task driving the grace period forward.
* A later core_initcall() rcu_exp_runtime_mode() will switch to full
* runtime RCU functionality.
*/ */
void rcu_scheduler_starting(void) void rcu_scheduler_starting(void)
{ {
WARN_ON(num_online_cpus() != 1); WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0); WARN_ON(nr_context_switches() > 0);
rcu_scheduler_active = 1; rcu_test_sync_prims();
rcu_scheduler_active = RCU_SCHEDULER_INIT;
rcu_test_sync_prims();
} }
/* /*
......
...@@ -531,6 +531,20 @@ struct rcu_exp_work { ...@@ -531,6 +531,20 @@ struct rcu_exp_work {
struct work_struct rew_work; struct work_struct rew_work;
}; };
/*
 * Common code to drive an expedited grace period forward, used by
 * workqueues and mid-boot-time tasks.  Factoring this out lets the
 * mid-boot path (scheduler started but workqueues not yet available)
 * drive the grace period directly in the requesting task's context,
 * while the runtime path runs it from a workqueue handler.
 *
 * @rsp:  the rcu_state whose expedited grace period is being driven.
 * @func: per-CPU handler passed through to sync_rcu_exp_select_cpus().
 * @s:    expedited grace-period sequence number to wait for.
 */
static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
smp_call_func_t func, unsigned long s)
{
/* Initialize the rcu_node tree in preparation for the wait. */
sync_rcu_exp_select_cpus(rsp, func);
/* Wait and clean up, including waking everyone. */
rcu_exp_wait_wake(rsp, s);
}
/* /*
* Work-queue handler to drive an expedited grace period forward. * Work-queue handler to drive an expedited grace period forward.
*/ */
...@@ -538,12 +552,8 @@ static void wait_rcu_exp_gp(struct work_struct *wp) ...@@ -538,12 +552,8 @@ static void wait_rcu_exp_gp(struct work_struct *wp)
{ {
struct rcu_exp_work *rewp; struct rcu_exp_work *rewp;
/* Initialize the rcu_node tree in preparation for the wait. */
rewp = container_of(wp, struct rcu_exp_work, rew_work); rewp = container_of(wp, struct rcu_exp_work, rew_work);
sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
/* Wait and clean up, including waking everyone. */
rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
} }
/* /*
...@@ -569,12 +579,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, ...@@ -569,12 +579,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
if (exp_funnel_lock(rsp, s)) if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */ return; /* Someone else did our work for us. */
/* Marshall arguments and schedule the expedited grace period. */ /* Ensure that load happens before action based on it. */
if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(rsp, func, s);
} else {
/* Marshall arguments & schedule the expedited grace period. */
rew.rew_func = func; rew.rew_func = func;
rew.rew_rsp = rsp; rew.rew_rsp = rsp;
rew.rew_s = s; rew.rew_s = s;
INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
schedule_work(&rew.rew_work); schedule_work(&rew.rew_work);
}
/* Wait for expedited grace period to complete. */ /* Wait for expedited grace period to complete. */
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
...@@ -676,6 +692,8 @@ void synchronize_rcu_expedited(void) ...@@ -676,6 +692,8 @@ void synchronize_rcu_expedited(void)
{ {
struct rcu_state *rsp = rcu_state_p; struct rcu_state *rsp = rcu_state_p;
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return;
_synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
} }
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
...@@ -693,3 +711,15 @@ void synchronize_rcu_expedited(void) ...@@ -693,3 +711,15 @@ void synchronize_rcu_expedited(void)
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
 * Switch to run-time mode once Tree RCU has fully initialized.
 *
 * Invoked at core_initcall() time, by which point RCU's kthreads and
 * workqueues are available, so synchronous grace periods no longer need
 * the requesting task to drive them directly.
 */
static int __init rcu_exp_runtime_mode(void)
{
/* Exercise the mid-boot (RCU_SCHEDULER_INIT) code paths one last time. */
rcu_test_sync_prims();
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
/* Now exercise the full runtime code paths under the new mode. */
rcu_test_sync_prims();
return 0;
}
core_initcall(rcu_exp_runtime_mode);
...@@ -670,7 +670,7 @@ void synchronize_rcu(void) ...@@ -670,7 +670,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map), lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section"); "Illegal synchronize_rcu() in RCU read-side critical section");
if (!rcu_scheduler_active) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return; return;
if (rcu_gp_is_expedited()) if (rcu_gp_is_expedited())
synchronize_rcu_expedited(); synchronize_rcu_expedited();
......
...@@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); ...@@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
* Should expedited grace-period primitives always fall back to their * Should expedited grace-period primitives always fall back to their
* non-expedited counterparts? Intended for use within RCU. Note * non-expedited counterparts? Intended for use within RCU. Note
* that if the user specifies both rcu_expedited and rcu_normal, then * that if the user specifies both rcu_expedited and rcu_normal, then
* rcu_normal wins. * rcu_normal wins. (Except during the time period during boot from
* when the first task is spawned until the rcu_exp_runtime_mode()
* core_initcall() is invoked, at which point everything is expedited.)
*/ */
bool rcu_gp_is_normal(void) bool rcu_gp_is_normal(void)
{ {
return READ_ONCE(rcu_normal); return READ_ONCE(rcu_normal) &&
rcu_scheduler_active != RCU_SCHEDULER_INIT;
} }
EXPORT_SYMBOL_GPL(rcu_gp_is_normal); EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
...@@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting = ...@@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting =
/* /*
* Should normal grace-period primitives be expedited? Intended for * Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited * use within RCU. Note that this function takes the rcu_expedited
* sysfs/boot variable into account as well as the rcu_expedite_gp() * sysfs/boot variable and rcu_scheduler_active into account as well
* nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp()
* returns false is a -really- bad idea. * until rcu_gp_is_expedited() returns false is a -really- bad idea.
*/ */
bool rcu_gp_is_expedited(void) bool rcu_gp_is_expedited(void)
{ {
return rcu_expedited || atomic_read(&rcu_expedited_nesting); return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
rcu_scheduler_active == RCU_SCHEDULER_INIT;
} }
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
...@@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map); ...@@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
int notrace debug_lockdep_rcu_enabled(void) int notrace debug_lockdep_rcu_enabled(void)
{ {
return rcu_scheduler_active && debug_locks && return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
current->lockdep_recursion == 0; current->lockdep_recursion == 0;
} }
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
...@@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); ...@@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
void synchronize_rcu_tasks(void) void synchronize_rcu_tasks(void)
{ {
/* Complain if the scheduler has not started. */ /* Complain if the scheduler has not started. */
RCU_LOCKDEP_WARN(!rcu_scheduler_active, RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
"synchronize_rcu_tasks called too soon"); "synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */ /* Wait for the grace period. */
...@@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void) ...@@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void)
#endif /* #ifdef CONFIG_TASKS_RCU */ #endif /* #ifdef CONFIG_TASKS_RCU */
/*
 * Test each non-SRCU synchronous grace-period wait API.  This is
 * useful just after a change in mode for these primitives, and
 * during early boot.
 *
 * Compiled away to a no-op unless CONFIG_PROVE_RCU is enabled, since
 * the calls below exist only to exercise (and complain about) any
 * currently illegal code paths, not to provide ordering guarantees.
 */
void rcu_test_sync_prims(void)
{
if (!IS_ENABLED(CONFIG_PROVE_RCU))
return;
/* Normal (non-expedited) synchronous grace-period primitives. */
synchronize_rcu();
synchronize_rcu_bh();
synchronize_sched();
/* Expedited variants, which take different code paths at boot. */
synchronize_rcu_expedited();
synchronize_rcu_bh_expedited();
synchronize_sched_expedited();
}
#ifdef CONFIG_PROVE_RCU #ifdef CONFIG_PROVE_RCU
/* /*
...@@ -865,6 +886,7 @@ void rcu_early_boot_tests(void) ...@@ -865,6 +886,7 @@ void rcu_early_boot_tests(void)
early_boot_test_call_rcu_bh(); early_boot_test_call_rcu_bh();
if (rcu_self_test_sched) if (rcu_self_test_sched)
early_boot_test_call_rcu_sched(); early_boot_test_call_rcu_sched();
rcu_test_sync_prims();
} }
static int rcu_verify_early_boot_tests(void) static int rcu_verify_early_boot_tests(void)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment