Commit 385b73c0 authored by Paul E. McKenney

rcu: Get rid of synchronize_sched_expedited()'s polling loop

This commit gets rid of synchronize_sched_expedited()'s mutex_trylock()
polling loop in favor of a funnel-locking scheme based on the rcu_node
tree.  The work-done check is done at each level of the tree, allowing
high-contention situations to be resolved quickly with reasonable levels
of mutex contention.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
parent d6ada2cf
...@@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree"); ...@@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree");
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
/* /*
* In order to export the rcu_state name to the tracing tools, it * In order to export the rcu_state name to the tracing tools, it
...@@ -103,7 +104,6 @@ struct rcu_state sname##_state = { \ ...@@ -103,7 +104,6 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \ .orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \ .name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \ .abbr = sabbr, \
} }
...@@ -3272,6 +3272,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data) ...@@ -3272,6 +3272,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
return 0; return 0;
} }
/*
 * Common code for synchronize_sched_expedited() work-done checking.
 *
 * Tests whether rsp->expedited_sequence has reached the snapshot value s
 * (ULONG_CMP_GE() is presumably a wrap-tolerant ">=" -- its definition is
 * not visible here).  If it has, the grace period the caller needs is
 * treated as already done, and this helper performs the whole early-exit
 * cleanup so that the caller can simply return:
 *
 *   - releases rnp->exp_funnel_mutex when rnp is non-NULL (the callers
 *     pass NULL while they hold no funnel mutex yet);
 *   - issues smp_mb__before_atomic() ahead of the counter update;
 *   - increments the statistics counter that stat points to;
 *   - calls put_online_cpus() (NOTE(review): assumes the caller took the
 *     matching get_online_cpus() reference -- confirm at the call site).
 *
 * Returns true if the work was already done (cleanup performed), false
 * otherwise, in which case nothing is modified or released.
 */
static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp,
atomic_long_t *stat, unsigned long s)
{
if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
/* Drop the funnel mutex, if any, that the caller still holds. */
if (rnp)
mutex_unlock(&rnp->exp_funnel_mutex);
/* Ensure test happens before caller kfree(). */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(stat);
put_online_cpus();
return true;
}
/* Sequence has not reached s: leave all caller-held state untouched. */
return false;
}
/** /**
* synchronize_sched_expedited - Brute-force RCU-sched grace period * synchronize_sched_expedited - Brute-force RCU-sched grace period
* *
...@@ -3286,15 +3302,15 @@ static int synchronize_sched_expedited_cpu_stop(void *data) ...@@ -3286,15 +3302,15 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* This implementation can be thought of as an application of sequence * This implementation can be thought of as an application of sequence
* locking to expedited grace periods, but using the sequence counter to * locking to expedited grace periods, but using the sequence counter to
* determine when someone else has already done the work instead of for * determine when someone else has already done the work instead of for
* retrying readers. We do a mutex_trylock() polling loop, but if we fail * retrying readers.
* too many times in a row, we fall back to synchronize_sched().
*/ */
void synchronize_sched_expedited(void) void synchronize_sched_expedited(void)
{ {
int cpu; int cpu;
long s; long s;
int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state; struct rcu_state *rsp = &rcu_sched_state;
struct rcu_node *rnp0;
struct rcu_node *rnp1 = NULL;
/* Take a snapshot of the sequence number. */ /* Take a snapshot of the sequence number. */
smp_mb(); /* Caller's modifications seen first by other CPUs. */ smp_mb(); /* Caller's modifications seen first by other CPUs. */
...@@ -3310,60 +3326,25 @@ void synchronize_sched_expedited(void) ...@@ -3310,60 +3326,25 @@ void synchronize_sched_expedited(void)
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
/* /*
* Each pass through the following loop attempts to acquire * Each pass through the following loop works its way
* ->expedited_mutex, checking for others doing our work each time. * up the rcu_node tree, returning if others have done the
*/ * work or otherwise falls through holding the root rnp's
while (!mutex_trylock(&rsp->expedited_mutex)) { * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
put_online_cpus(); * can be inexact, as it is just promoting locality and is not
atomic_long_inc(&rsp->expedited_tryfail); * strictly needed for correctness.
*/
/* Check to see if someone else did our work for us. */ rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { for (; rnp0 != NULL; rnp0 = rnp0->parent) {
/* ensure test happens before caller kfree */ if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
return; return;
mutex_lock(&rnp0->exp_funnel_mutex);
if (rnp1)
mutex_unlock(&rnp1->exp_funnel_mutex);
rnp1 = rnp0;
} }
rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */
/* No joy, try again later. Or just synchronize_sched(). */ if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
if (trycount++ < 10) {
udelay(trycount * num_online_cpus());
} else {
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
return; return;
}
/* Recheck to see if someone else did our work for us. */
if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
return;
}
/*
* Refetching sync_sched_expedited_started allows later
* callers to piggyback on our grace period. We retry
* after they started, so our grace period works for them,
* and they started after our first try, so their grace
* period works for us.
*/
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, use normal GP. */
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
return;
}
}
/* Recheck yet again to see if someone else did our work for us. */
if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
rsp->expedited_workdone3++;
mutex_unlock(&rsp->expedited_mutex);
smp_mb(); /* ensure test happens before caller kfree */
return;
}
WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
smp_mb(); /* Ensure expedited GP seen after counter increment. */ smp_mb(); /* Ensure expedited GP seen after counter increment. */
...@@ -3383,7 +3364,7 @@ void synchronize_sched_expedited(void) ...@@ -3383,7 +3364,7 @@ void synchronize_sched_expedited(void)
smp_mb(); /* Ensure expedited GP seen before counter increment. */ smp_mb(); /* Ensure expedited GP seen before counter increment. */
WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
WARN_ON_ONCE(rsp->expedited_sequence & 0x1); WARN_ON_ONCE(rsp->expedited_sequence & 0x1);
mutex_unlock(&rsp->expedited_mutex); mutex_unlock(&rnp0->exp_funnel_mutex);
smp_mb(); /* ensure subsequent action seen after grace period. */ smp_mb(); /* ensure subsequent action seen after grace period. */
put_online_cpus(); put_online_cpus();
...@@ -3940,6 +3921,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, ...@@ -3940,6 +3921,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
{ {
static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT;
static const char * const exp[] = RCU_EXP_NAME_INIT;
static u8 fl_mask = 0x1; static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
...@@ -3998,6 +3980,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, ...@@ -3998,6 +3980,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->level = i; rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks); INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp); rcu_init_one_nocb(rnp);
mutex_init(&rnp->exp_funnel_mutex);
lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
&rcu_exp_class[i], exp[i]);
} }
} }
......
...@@ -68,6 +68,7 @@ ...@@ -68,6 +68,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
# define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_NODE_NAME_INIT { "rcu_node_0" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
#elif NR_CPUS <= RCU_FANOUT_2 #elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2 # define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_0 1
...@@ -76,6 +77,7 @@ ...@@ -76,6 +77,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
#elif NR_CPUS <= RCU_FANOUT_3 #elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3 # define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_0 1
...@@ -85,6 +87,7 @@ ...@@ -85,6 +87,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
#elif NR_CPUS <= RCU_FANOUT_4 #elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4 # define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_0 1
...@@ -95,6 +98,7 @@ ...@@ -95,6 +98,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
#else #else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
...@@ -237,6 +241,8 @@ struct rcu_node { ...@@ -237,6 +241,8 @@ struct rcu_node {
int need_future_gp[2]; int need_future_gp[2];
/* Counts of upcoming no-CB GP requests. */ /* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp;
/* /*
...@@ -480,12 +486,10 @@ struct rcu_state { ...@@ -480,12 +486,10 @@ struct rcu_state {
/* _rcu_barrier(). */ /* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */ /* End of fields guarded by barrier_mutex. */
struct mutex expedited_mutex; /* Serializes expediting. */
unsigned long expedited_sequence; /* Take a ticket. */ unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_tryfail; /* # acquisition failures. */ atomic_long_t expedited_tryfail; /* # acquisition failures. */
atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone1; /* # done by others #1. */
atomic_long_t expedited_workdone2; /* # done by others #2. */ atomic_long_t expedited_workdone2; /* # done by others #2. */
unsigned long expedited_workdone3; /* # done by others #3. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */
unsigned long jiffies_force_qs; /* Time at which to invoke */ unsigned long jiffies_force_qs; /* Time at which to invoke */
......
...@@ -185,12 +185,11 @@ static int show_rcuexp(struct seq_file *m, void *v) ...@@ -185,12 +185,11 @@ static int show_rcuexp(struct seq_file *m, void *v)
{ {
struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_state *rsp = (struct rcu_state *)m->private;
seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu sc=%lu\n", seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n",
rsp->expedited_sequence, rsp->expedited_sequence,
atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_tryfail),
atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone1),
atomic_long_read(&rsp->expedited_workdone2), atomic_long_read(&rsp->expedited_workdone2),
rsp->expedited_workdone3,
atomic_long_read(&rsp->expedited_normal), atomic_long_read(&rsp->expedited_normal),
rsp->expedited_sequence / 2); rsp->expedited_sequence / 2);
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment