Commit 7f6dcffb authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'sched-rt-2022-10-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull preempt RT updates from Thomas Gleixner:
 "Introduce preempt_[dis|enable_nested() and use it to clean up various
  places which have open coded PREEMPT_RT conditionals.

  On PREEMPT_RT enabled kernels, spinlocks and rwlocks are neither
  disabling preemption nor interrupts. Though there are a few places
  which depend on the implicit preemption/interrupt disable of those
  locks, e.g. seqcount write sections, per CPU statistics updates etc.

  PREEMPT_RT added open coded CONFIG_PREEMPT_RT conditionals to
  disable/enable preemption in the related code parts all over the
  place. That's hard to read and does not really explain why this is
  necessary.

  Linus suggested to use helper functions (preempt_disable_nested() and
  preempt_enable_nested()) and use those in the affected places. On !RT
  enabled kernels these functions are NOPs, but contain a lockdep assert
  to validate that preemption is actually disabled to catch call sites
  which do not have preemption disabled.

  Clean up the affected code paths in mm, dentry and lib"

* tag 'sched-rt-2022-10-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  u64_stats: Streamline the implementation
  flex_proportions: Disable preemption entering the write section.
  mm/compaction: Get rid of RT ifdeffery
  mm/memcontrol: Replace the PREEMPT_RT conditionals
  mm/debug: Provide VM_WARN_ON_IRQS_ENABLED()
  mm/vmstat: Use preempt_[dis|en]able_nested()
  dentry: Use preempt_[dis|en]able_nested()
  preempt: Provide preempt_[dis|en]able_nested()
parents 65f109e1 44b0c295
......@@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash);
static inline unsigned start_dir_add(struct inode *dir)
{
/*
* The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT
* kernels spin_lock() implicitly disables preemption, but not on
* PREEMPT_RT. So for RT it has to be done explicitly to protect
* the sequence count write side critical section against a reader
* or another writer preempting, which would result in a live lock.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
for (;;) {
unsigned n = dir->i_dir_seq;
if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
......@@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n,
wait_queue_head_t *d_wait)
{
smp_store_release(&dir->i_dir_seq, n + 2);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
wake_up_all(d_wait);
}
......
......@@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm);
#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
#endif
#ifdef CONFIG_DEBUG_VM_IRQSOFF
#define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled())
#else
#define VM_WARN_ON_IRQS_ENABLED() do { } while (0)
#endif
#ifdef CONFIG_DEBUG_VIRTUAL
#define VIRTUAL_BUG_ON(cond) BUG_ON(cond)
#else
......
......@@ -421,4 +421,46 @@ static inline void migrate_enable(void) { }
#endif /* CONFIG_SMP */
/**
* preempt_disable_nested - Disable preemption inside a normally preempt disabled section
*
* Use for code which requires preemption protection inside a critical
* section which has preemption disabled implicitly on non-PREEMPT_RT
* enabled kernels, by e.g.:
* - holding a spinlock/rwlock
* - soft interrupt context
* - regular interrupt handlers
*
* On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft
* interrupt context and regular interrupt handlers are preemptible and
* only prevent migration. preempt_disable_nested() ensures that preemption
* is disabled for cases which require CPU local serialization even on
* PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP.
*
* The use cases are code sequences which are not serialized by a
* particular lock instance, e.g.:
* - seqcount write side critical sections where the seqcount is not
* associated to a particular lock and therefore the automatic
* protection mechanism does not work. This prevents a live lock
* against a preempting high priority reader.
* - RMW per CPU variable updates like vmstat.
*/
/* Macro to avoid header recursion hell vs. lockdep */
#define preempt_disable_nested() \
do { \
if (IS_ENABLED(CONFIG_PREEMPT_RT)) \
preempt_disable(); \
else \
lockdep_assert_preemption_disabled(); \
} while (0)
/**
* preempt_enable_nested - Undo the effect of preempt_disable_nested()
*/
static __always_inline void preempt_enable_nested(void)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
}
#endif /* __LINUX_PREEMPT_H */
......@@ -8,7 +8,7 @@
*
* Key points :
*
* - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP.
* - Use a seqcount on 32-bit
* - The whole thing is a no-op on 64-bit architectures.
*
* Usage constraints:
......@@ -20,7 +20,8 @@
* writer and also spin forever.
*
* 3) Write side must use the _irqsave() variant if other writers, or a reader,
* can be invoked from an IRQ context.
* can be invoked from an IRQ context. On 64bit systems this variant does not
* disable interrupts.
*
* 4) If reader fetches several counters, there is no guarantee the whole values
* are consistent w.r.t. each other (remember point #2: seqcounts are not
......@@ -29,11 +30,6 @@
* 5) Readers are allowed to sleep or be preempted/interrupted: they perform
* pure reads.
*
* 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats
* might be updated from a hardirq or softirq context (remember point #1:
* seqcounts are not used for UP kernels). 32-bit UP stat readers could read
* corrupted 64-bit values otherwise.
*
* Usage :
*
* Stats producer (writer) should use following template granted it already got
......@@ -66,7 +62,7 @@
#include <linux/seqlock.h>
struct u64_stats_sync {
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
#if BITS_PER_LONG == 32
seqcount_t seq;
#endif
};
......@@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p)
local64_inc(&p->v);
}
#else
static inline void u64_stats_init(struct u64_stats_sync *syncp) { }
static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { }
static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { }
static inline unsigned long __u64_stats_irqsave(void) { return 0; }
static inline void __u64_stats_irqrestore(unsigned long flags) { }
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
return 0;
}
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
return false;
}
#else /* 64 bit */
typedef struct {
u64 v;
......@@ -123,123 +134,95 @@ static inline void u64_stats_inc(u64_stats_t *p)
{
p->v++;
}
#endif
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq)
#else
static inline void u64_stats_init(struct u64_stats_sync *syncp)
{
seqcount_init(&syncp->seq);
}
#endif
static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
write_seqcount_begin(&syncp->seq);
#endif
}
static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
static inline void __u64_stats_update_end(struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
write_seqcount_end(&syncp->seq);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
#endif
preempt_enable_nested();
}
static inline unsigned long
u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
static inline unsigned long __u64_stats_irqsave(void)
{
unsigned long flags = 0;
unsigned long flags;
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
else
local_irq_save(flags);
write_seqcount_begin(&syncp->seq);
#endif
return flags;
}
static inline void
u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
unsigned long flags)
static inline void __u64_stats_irqrestore(unsigned long flags)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
write_seqcount_end(&syncp->seq);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
else
local_irq_restore(flags);
#endif
}
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
return read_seqcount_begin(&syncp->seq);
#else
return 0;
#endif
}
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT))
preempt_disable();
#endif
return __u64_stats_fetch_begin(syncp);
}
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT))
return read_seqcount_retry(&syncp->seq, start);
#else
return false;
#endif
}
#endif /* !64 bit */
static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
{
__u64_stats_update_begin(syncp);
}
static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
{
__u64_stats_update_end(syncp);
}
static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
{
unsigned long flags = __u64_stats_irqsave();
__u64_stats_update_begin(syncp);
return flags;
}
static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
unsigned long flags)
{
__u64_stats_update_end(syncp);
__u64_stats_irqrestore(flags);
}
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
return __u64_stats_fetch_begin(syncp);
}
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT))
preempt_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
}
/*
* In case irq handlers can update u64 counters, readers can use following helpers
* - SMP 32bit arches use seqcount protection, irq safe.
* - UP 32bit must disable irqs.
* - 64bit have no problem atomically reading u64 values, irq safe.
*/
/* Obsolete interfaces */
static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT)
preempt_disable();
#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP)
local_irq_disable();
#endif
return __u64_stats_fetch_begin(syncp);
return u64_stats_fetch_begin(syncp);
}
static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
unsigned int start)
{
#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT)
preempt_enable();
#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP)
local_irq_enable();
#endif
return __u64_stats_fetch_retry(syncp, start);
return u64_stats_fetch_retry(syncp, start);
}
#endif /* _LINUX_U64_STATS_SYNC_H */
......@@ -805,6 +805,9 @@ config ARCH_HAS_DEBUG_VM_PGTABLE
An architecture should select this when it can successfully
build and run DEBUG_VM_PGTABLE.
config DEBUG_VM_IRQSOFF
def_bool DEBUG_VM && !PREEMPT_RT
config DEBUG_VM
bool "Debug VM"
depends on DEBUG_KERNEL
......
......@@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_global *p, int periods)
*/
if (events <= 1)
return false;
preempt_disable_nested();
write_seqcount_begin(&p->sequence);
if (periods < 64)
events -= events >> periods;
......@@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_global *p, int periods)
percpu_counter_add(&p->events, -events);
p->period += periods;
write_seqcount_end(&p->sequence);
preempt_enable_nested();
return true;
}
......
......@@ -579,6 +579,12 @@ config COMPACTION
it and then we would be really interested to hear about that at
linux-mm@kvack.org.
config COMPACT_UNEVICTABLE_DEFAULT
int
depends on COMPACTION
default 0 if PREEMPT_RT
default 1
#
# support for free page reporting
config PAGE_REPORTING
......
......@@ -1727,11 +1727,7 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
#ifdef CONFIG_PREEMPT_RT
int sysctl_compact_unevictable_allowed __read_mostly = 0;
#else
int sysctl_compact_unevictable_allowed __read_mostly = 1;
#endif
int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
......
......@@ -597,25 +597,18 @@ static u64 flush_next_time;
*/
static void memcg_stats_lock(void)
{
#ifdef CONFIG_PREEMPT_RT
preempt_disable();
#else
VM_BUG_ON(!irqs_disabled());
#endif
preempt_disable_nested();
VM_WARN_ON_IRQS_ENABLED();
}
static void __memcg_stats_lock(void)
{
#ifdef CONFIG_PREEMPT_RT
preempt_disable();
#endif
preempt_disable_nested();
}
static void memcg_stats_unlock(void)
{
#ifdef CONFIG_PREEMPT_RT
preempt_enable();
#endif
preempt_enable_nested();
}
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
......@@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
* interrupt context while other caller need to have disabled interrupt.
*/
__memcg_stats_lock();
if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
if (IS_ENABLED(CONFIG_DEBUG_VM)) {
switch (idx) {
case NR_ANON_MAPPED:
case NR_FILE_MAPPED:
......@@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
WARN_ON_ONCE(!in_task());
break;
default:
WARN_ON_ONCE(!irqs_disabled());
VM_WARN_ON_IRQS_ENABLED();
}
}
......
......@@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
* CPU migrations and preemption potentially corrupts a counter so
* disable preemption.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
x = delta + __this_cpu_read(*p);
......@@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
}
__this_cpu_write(*p, x);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);
......@@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
x = delta + __this_cpu_read(*p);
......@@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
__this_cpu_write(*p, x);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);
......@@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
......@@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, -overstep);
}
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
......@@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
......@@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, -overstep);
}
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
......@@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
......@@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, overstep);
}
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
......@@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
......@@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, overstep);
}
if (IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
preempt_enable_nested();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment