Commit f6a12f34 authored by Paul E. McKenney's avatar Paul E. McKenney

rcu: Enforce expedited-GP fairness via funnel wait queue

The current mutex-based funnel-locking approach used by expedited grace
periods is subject to severe unfairness.  The problem arises when a
few tasks, making a path from leaves to root, all wake up before other
tasks do.  A new task can then follow this path all the way to the root,
which needlessly delays tasks whose grace period is done, but who do
not happen to acquire the lock quickly enough.

This commit avoids this problem by maintaining per-rcu_node wait queues,
along with a per-rcu_node counter that tracks the latest grace period
sought by an earlier task to visit this node.  If that grace period
would satisfy the current task, instead of proceeding up the tree,
it waits on the current rcu_node structure using a pair of wait queues
provided for that purpose.  This decouples awakening of old tasks from
the arrival of new tasks.

If the wakeups prove to be a bottleneck, additional kthreads can be
brought to bear for that purpose.
Signed-off-by: default avatarPaul E. McKenney <paulmck@linux.vnet.ibm.com>
parent d40a4f09
......@@ -179,6 +179,7 @@ TRACE_EVENT(rcu_grace_period_init,
* "snap": Captured snapshot of expedited grace period sequence number.
* "start": Started a real expedited grace period.
* "end": Ended a real expedited grace period.
* "endwake": Woke piggybackers up.
* "done": Someone else did the expedited grace period for us.
*/
TRACE_EVENT(rcu_exp_grace_period,
......@@ -210,8 +211,8 @@ TRACE_EVENT(rcu_exp_grace_period,
* and highest-numbered CPU associated with the current rcu_node structure,
* and a string. identifying the grace-period-related event as follows:
*
* "acq": Acquired a level of funnel lock
* "rel": Released a level of funnel lock
* "nxtlvl": Advance to next level of rcu_node funnel
* "wait": Wait for someone else to do expedited GP
*/
TRACE_EVENT(rcu_exp_funnel_lock,
......
This diff is collapsed.
......@@ -70,7 +70,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
# define RCU_NODE_NAME_INIT { "rcu_node_0" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
......@@ -79,7 +78,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
......@@ -89,7 +87,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
......@@ -100,7 +97,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
......@@ -252,7 +248,9 @@ struct rcu_node {
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
unsigned long exp_seq_rq;
wait_queue_head_t exp_wq[2];
} ____cacheline_internodealigned_in_smp;
/*
......@@ -387,7 +385,6 @@ struct rcu_data {
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
struct mutex exp_funnel_mutex;
atomic_long_t exp_workdone1; /* # done by others #1. */
atomic_long_t exp_workdone2; /* # done by others #2. */
atomic_long_t exp_workdone3; /* # done by others #3. */
......@@ -504,6 +501,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
struct mutex exp_mutex; /* Serialize expedited GP. */
unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
......
......@@ -738,8 +738,6 @@ static void sync_rcu_exp_handler(void *info)
*/
void synchronize_rcu_expedited(void)
{
struct rcu_node *rnp;
struct rcu_node *rnp_unlock;
struct rcu_state *rsp = rcu_state_p;
unsigned long s;
......@@ -752,8 +750,7 @@ void synchronize_rcu_expedited(void)
s = rcu_exp_gp_seq_snap(rsp);
trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
rnp_unlock = exp_funnel_lock(rsp, s);
if (rnp_unlock == NULL)
if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */
rcu_exp_gp_seq_start(rsp);
......@@ -763,16 +760,13 @@ void synchronize_rcu_expedited(void)
sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
/* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp);
synchronize_sched_expedited_wait(rsp);
/* Clean up and exit. */
rcu_exp_gp_seq_end(rsp);
trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
mutex_unlock(&rnp_unlock->exp_funnel_mutex);
trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level,
rnp_unlock->grplo, rnp_unlock->grphi,
TPS("rel"));
rcu_exp_wake(rsp, s);
trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
mutex_unlock(&rsp->exp_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment