Commit 25f27ce4 authored by Paul E. McKenney's avatar Paul E. McKenney

Merge branches 'doc.2013.08.19a', 'fixes.2013.08.20a', 'sysidle.2013.08.31a'...

Merge branches 'doc.2013.08.19a', 'fixes.2013.08.20a', 'sysidle.2013.08.31a' and 'torture.2013.08.20a' into HEAD

doc.2013.08.19a: Documentation updates
fixes.2013.08.20a: Miscellaneous fixes
sysidle.2013.08.31a: Detect system-wide idle state.
torture.2013.08.20a: rcutorture updates.
......@@ -42,6 +42,16 @@ fqs_holdoff Holdoff time (in microseconds) between consecutive calls
fqs_stutter Wait time (in seconds) between consecutive bursts
of calls to force_quiescent_state().
gp_normal Make the fake writers use normal synchronous grace-period
primitives.
gp_exp Make the fake writers use expedited synchronous grace-period
primitives. If both gp_normal and gp_exp are set, or
if neither gp_normal nor gp_exp is set, then randomly
choose the primitive so that about 50% are normal and
50% expedited. By default, neither is set, which
gives best overall test coverage.
irqreader Says to invoke RCU readers from irq level. This is currently
done via timers. Defaults to "1" for variants of RCU that
permit this. (Or, more accurately, variants of RCU that do
......
......@@ -24,8 +24,8 @@ There are three main ways of managing scheduling-clock interrupts
workloads, you will normally -not- want this option.
These three cases are described in the following three sections, followed
by a third section on RCU-specific considerations and a fourth and final
section listing known issues.
by a fourth section on RCU-specific considerations, a fifth section
discussing testing, and a sixth and final section listing known issues.
NEVER OMIT SCHEDULING-CLOCK TICKS
......@@ -121,14 +121,15 @@ boot parameter specifies the adaptive-ticks CPUs. For example,
"nohz_full=1,6-8" says that CPUs 1, 6, 7, and 8 are to be adaptive-ticks
CPUs. Note that you are prohibited from marking all of the CPUs as
adaptive-tick CPUs: At least one non-adaptive-tick CPU must remain
online to handle timekeeping tasks in order to ensure that system calls
like gettimeofday() returns accurate values on adaptive-tick CPUs.
(This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no
running user processes to observe slight drifts in clock rate.)
Therefore, the boot CPU is prohibited from entering adaptive-ticks
mode. Specifying a "nohz_full=" mask that includes the boot CPU will
result in a boot-time error message, and the boot CPU will be removed
from the mask.
online to handle timekeeping tasks in order to ensure that system
calls like gettimeofday() return accurate values on adaptive-tick CPUs.
(This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no running
user processes to observe slight drifts in clock rate.) Therefore, the
boot CPU is prohibited from entering adaptive-ticks mode. Specifying a
"nohz_full=" mask that includes the boot CPU will result in a boot-time
error message, and the boot CPU will be removed from the mask. Note that
this means that your system must have at least two CPUs in order for
CONFIG_NO_HZ_FULL=y to do anything for you.
Alternatively, the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter specifies
that all CPUs other than the boot CPU are adaptive-ticks CPUs. This
......@@ -232,6 +233,29 @@ scheduler will decide where to run them, which might or might not be
where you want them to run.
TESTING
So you enable all the OS-jitter features described in this document,
but do not see any change in your workload's behavior. Is this because
your workload isn't affected that much by OS jitter, or is it because
something else is in the way? This section helps answer this question
by providing a simple OS-jitter test suite, which is available on branch
master of the following git archive:
git://git.kernel.org/pub/scm/linux/kernel/git/frederic/dynticks-testing.git
Clone this archive and follow the instructions in the README file.
This test procedure will produce a trace that will allow you to evaluate
whether or not you have succeeded in removing OS jitter from your system.
If this trace shows that you have removed OS jitter as much as is
possible, then you can conclude that your workload is not all that
sensitive to OS jitter.
Note: this test requires that your system have at least two CPUs.
We do not currently have a good way to remove OS jitter from single-CPU
systems.
KNOWN ISSUES
o Dyntick-idle slows transitions to and from idle slightly.
......
......@@ -63,7 +63,7 @@ struct debug_obj_descr {
extern void debug_object_init (void *addr, struct debug_obj_descr *descr);
extern void
debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr);
extern void debug_object_activate (void *addr, struct debug_obj_descr *descr);
extern int debug_object_activate (void *addr, struct debug_obj_descr *descr);
extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr);
extern void debug_object_destroy (void *addr, struct debug_obj_descr *descr);
extern void debug_object_free (void *addr, struct debug_obj_descr *descr);
......@@ -85,8 +85,8 @@ static inline void
debug_object_init (void *addr, struct debug_obj_descr *descr) { }
static inline void
debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr) { }
static inline void
debug_object_activate (void *addr, struct debug_obj_descr *descr) { }
static inline int
debug_object_activate (void *addr, struct debug_obj_descr *descr) { return 0; }
static inline void
debug_object_deactivate(void *addr, struct debug_obj_descr *descr) { }
static inline void
......
......@@ -101,13 +101,13 @@ static inline u64 get_jiffies_64(void)
#define time_after(a,b) \
(typecheck(unsigned long, a) && \
typecheck(unsigned long, b) && \
((long)(b) - (long)(a) < 0))
((long)((b) - (a)) < 0))
#define time_before(a,b) time_after(b,a)
#define time_after_eq(a,b) \
(typecheck(unsigned long, a) && \
typecheck(unsigned long, b) && \
((long)(a) - (long)(b) >= 0))
((long)((a) - (b)) >= 0))
#define time_before_eq(a,b) time_after_eq(b,a)
/*
......@@ -130,13 +130,13 @@ static inline u64 get_jiffies_64(void)
#define time_after64(a,b) \
(typecheck(__u64, a) && \
typecheck(__u64, b) && \
((__s64)(b) - (__s64)(a) < 0))
((__s64)((b) - (a)) < 0))
#define time_before64(a,b) time_after64(b,a)
#define time_after_eq64(a,b) \
(typecheck(__u64, a) && \
typecheck(__u64, b) && \
((__s64)(a) - (__s64)(b) >= 0))
((__s64)((a) - (b)) >= 0))
#define time_before_eq64(a,b) time_after_eq64(b,a)
#define time_in_range64(a, b, c) \
......
......@@ -267,8 +267,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
*/
#define list_first_or_null_rcu(ptr, type, member) \
({struct list_head *__ptr = (ptr); \
struct list_head __rcu *__next = list_next_rcu(__ptr); \
likely(__ptr != __next) ? container_of(__next, type, member) : NULL; \
struct list_head *__next = ACCESS_ONCE(__ptr->next); \
likely(__ptr != __next) ? \
list_entry_rcu(__next, type, member) : NULL; \
})
/**
......
......@@ -229,13 +229,9 @@ extern void rcu_irq_exit(void);
#ifdef CONFIG_RCU_USER_QS
extern void rcu_user_enter(void);
extern void rcu_user_exit(void);
extern void rcu_user_enter_after_irq(void);
extern void rcu_user_exit_after_irq(void);
#else
static inline void rcu_user_enter(void) { }
static inline void rcu_user_exit(void) { }
static inline void rcu_user_enter_after_irq(void) { }
static inline void rcu_user_exit_after_irq(void) { }
static inline void rcu_user_hooks_switch(struct task_struct *prev,
struct task_struct *next) { }
#endif /* CONFIG_RCU_USER_QS */
......@@ -1015,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
/*
 * Full-system-idle interface.  Only for use by adaptive-ticks code.
 * Real implementations are declared when CONFIG_NO_HZ_FULL_SYSIDLE is
 * set; otherwise the inline stubs below are used.
 */
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
extern bool rcu_sys_is_idle(void);
extern void rcu_sysidle_force_exit(void);
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
/* Stub for !CONFIG_NO_HZ_FULL_SYSIDLE: always returns false. */
static inline bool rcu_sys_is_idle(void)
{
return false;
}
/* Stub for !CONFIG_NO_HZ_FULL_SYSIDLE: intentionally does nothing. */
static inline void rcu_sysidle_force_exit(void)
{
}
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
#endif /* __LINUX_RCUPDATE_H */
......@@ -470,6 +470,7 @@ config TREE_RCU
config TREE_PREEMPT_RCU
bool "Preemptible tree-based hierarchical RCU"
depends on PREEMPT
select IRQ_WORK
help
This option selects the RCU implementation that is
designed for very large SMP systems with hundreds or
......
......@@ -67,12 +67,15 @@
extern struct debug_obj_descr rcuhead_debug_descr;
static inline void debug_rcu_head_queue(struct rcu_head *head)
static inline int debug_rcu_head_queue(struct rcu_head *head)
{
debug_object_activate(head, &rcuhead_debug_descr);
int r1;
r1 = debug_object_activate(head, &rcuhead_debug_descr);
debug_object_active_state(head, &rcuhead_debug_descr,
STATE_RCU_HEAD_READY,
STATE_RCU_HEAD_QUEUED);
return r1;
}
static inline void debug_rcu_head_unqueue(struct rcu_head *head)
......@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
debug_object_deactivate(head, &rcuhead_debug_descr);
}
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
static inline void debug_rcu_head_queue(struct rcu_head *head)
static inline int debug_rcu_head_queue(struct rcu_head *head)
{
return 0;
}
static inline void debug_rcu_head_unqueue(struct rcu_head *head)
......
......@@ -211,43 +211,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head)
debug_object_free(head, &rcuhead_debug_descr);
}
/*
* fixup_init is called when:
* - an active object is initialized
*/
static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
{
struct rcu_head *head = addr;
switch (state) {
case ODEBUG_STATE_ACTIVE:
/*
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
* In !PREEMPT configurations, there is no way to tell if we are
* in a RCU read-side critical section or not, so we never
* attempt any fixup and just print a warning.
*/
#ifndef CONFIG_PREEMPT
WARN_ON_ONCE(1);
return 0;
#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
rcu_barrier_sched();
rcu_barrier_bh();
debug_object_init(head, &rcuhead_debug_descr);
return 1;
default:
return 0;
}
}
/*
* fixup_activate is called when:
* - an active object is activated
......@@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
debug_object_init(head, &rcuhead_debug_descr);
debug_object_activate(head, &rcuhead_debug_descr);
return 0;
case ODEBUG_STATE_ACTIVE:
/*
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
* In !PREEMPT configurations, there is no way to tell if we are
* in a RCU read-side critical section or not, so we never
* attempt any fixup and just print a warning.
*/
#ifndef CONFIG_PREEMPT
WARN_ON_ONCE(1);
return 0;
#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
rcu_barrier_sched();
rcu_barrier_bh();
debug_object_activate(head, &rcuhead_debug_descr);
return 1;
default:
return 0;
}
}
/*
* fixup_free is called when:
* - an active object is freed
*/
static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
{
struct rcu_head *head = addr;
switch (state) {
case ODEBUG_STATE_ACTIVE:
/*
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
* In !PREEMPT configurations, there is no way to tell if we are
* in a RCU read-side critical section or not, so we never
* attempt any fixup and just print a warning.
*/
#ifndef CONFIG_PREEMPT
WARN_ON_ONCE(1);
return 0;
#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
rcu_barrier_sched();
rcu_barrier_bh();
debug_object_free(head, &rcuhead_debug_descr);
return 1;
default:
return 0;
}
}
......@@ -369,9 +271,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
.fixup_init = rcuhead_fixup_init,
.fixup_activate = rcuhead_fixup_activate,
.fixup_free = rcuhead_fixup_free,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
......
This diff is collapsed.
This diff is collapsed.
......@@ -88,6 +88,14 @@ struct rcu_dynticks {
/* Process level is worth LLONG_MAX/2. */
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
long long dynticks_idle_nesting;
/* irq/process nesting level from idle. */
atomic_t dynticks_idle; /* Even value for idle, else odd. */
/* "Idle" excludes userspace execution. */
unsigned long dynticks_idle_jiffies;
/* End of last non-NMI non-idle period. */
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
#ifdef CONFIG_RCU_FAST_NO_HZ
bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted;
......@@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
static void rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj);
static bool is_sysidle_rcu_state(struct rcu_state *rsp);
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj);
static void rcu_bind_gp_kthread(void);
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
#endif /* #ifndef RCU_TREE_NONCORE */
......
This diff is collapsed.
......@@ -134,6 +134,56 @@ config NO_HZ_FULL_ALL
Note the boot CPU will still be kept outside the range to
handle the timekeeping duty.
config NO_HZ_FULL_SYSIDLE
bool "Detect full-system idle state for full dynticks system"
depends on NO_HZ_FULL
default n
help
At least one CPU must keep the scheduling-clock tick running for
timekeeping purposes whenever there is a non-idle CPU, where
"non-idle" also includes dynticks CPUs as long as they are
running non-idle tasks. Because the underlying adaptive-tick
support cannot distinguish between all CPUs being idle and
all CPUs each running a single task in dynticks mode, the
underlying support simply ensures that there is always a CPU
handling the scheduling-clock tick, whether or not all CPUs
are idle. This Kconfig option enables scalable detection of
the all-CPUs-idle state, thus allowing the scheduling-clock
tick to be disabled when all CPUs are idle. Note that scalable
detection of the all-CPUs-idle state means that larger systems
will be slower to declare the all-CPUs-idle state.
Say Y if you would like to help debug all-CPUs-idle detection.
Say N if you are unsure.
config NO_HZ_FULL_SYSIDLE_SMALL
int "Number of CPUs above which large-system approach is used"
depends on NO_HZ_FULL_SYSIDLE
range 1 NR_CPUS
default 8
help
The full-system idle detection mechanism takes a lazy approach
on large systems, as is required to attain decent scalability.
However, on smaller systems, scalability is not anywhere near as
large a concern as is energy efficiency. The sysidle subsystem
therefore uses a fast but non-scalable algorithm for small
systems and a lazier but scalable algorithm for large systems.
This Kconfig parameter defines the number of CPUs in the largest
system that will be considered to be "small".
The default value will be fine in most cases. Battery-powered
systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
numbers of CPUs, and (3) are suffering from battery-lifetime
problems due to long sysidle latencies might wish to experiment
with larger values for this Kconfig parameter. On the other
hand, they might be even better served by disabling NO_HZ_FULL
entirely, given that NO_HZ_FULL is intended for HPC and
real-time workloads that at present do not tend to be run on
battery-powered systems.
Take the default if you are unsure.
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
......
......@@ -381,19 +381,21 @@ void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr)
* debug_object_activate - debug checks when an object is activated
* @addr: address of the object
* @descr: pointer to an object specific debug description structure
* Returns 0 on success, or -EINVAL if the check failed.
*/
void debug_object_activate(void *addr, struct debug_obj_descr *descr)
int debug_object_activate(void *addr, struct debug_obj_descr *descr)
{
enum debug_obj_state state;
struct debug_bucket *db;
struct debug_obj *obj;
unsigned long flags;
int ret;
struct debug_obj o = { .object = addr,
.state = ODEBUG_STATE_NOTAVAILABLE,
.descr = descr };
if (!debug_objects_enabled)
return;
return 0;
db = get_bucket((unsigned long) addr);
......@@ -405,23 +407,26 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr)
case ODEBUG_STATE_INIT:
case ODEBUG_STATE_INACTIVE:
obj->state = ODEBUG_STATE_ACTIVE;
ret = 0;
break;
case ODEBUG_STATE_ACTIVE:
debug_print_object(obj, "activate");
state = obj->state;
raw_spin_unlock_irqrestore(&db->lock, flags);
debug_object_fixup(descr->fixup_activate, addr, state);
return;
ret = debug_object_fixup(descr->fixup_activate, addr, state);
return ret ? -EINVAL : 0;
case ODEBUG_STATE_DESTROYED:
debug_print_object(obj, "activate");
ret = -EINVAL;
break;
default:
ret = 0;
break;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
return;
return ret;
}
raw_spin_unlock_irqrestore(&db->lock, flags);
......@@ -431,8 +436,11 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr)
* true or not.
*/
if (debug_object_fixup(descr->fixup_activate, addr,
ODEBUG_STATE_NOTAVAILABLE))
ODEBUG_STATE_NOTAVAILABLE)) {
debug_print_object(&o, "activate");
return -EINVAL;
}
return 0;
}
/**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment