Commit 223baf9d authored by Mathieu Desnoyers's avatar Mathieu Desnoyers Committed by Peter Zijlstra

sched: Fix performance regression introduced by mm_cid

Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL
sysbench regression reported by Aaron Lu.

Keep track of the currently allocated mm_cid for each mm/cpu rather than
freeing them immediately on context switch. This eliminates most atomic
operations when context switching back and forth between threads
belonging to different memory spaces in multi-threaded scenarios (many
processes, each with many threads). The per-mm/per-cpu mm_cid values are
serialized by their respective runqueue locks.

Thread migration is handled by introducing invocation to
sched_mm_cid_migrate_to() (with destination runqueue lock held) in
activate_task() for migrating tasks. If the destination cpu's mm_cid is
unset, and if the source runqueue is not actively using its mm_cid, then
the source cpu's mm_cid is moved to the destination cpu on migration.

Introduce a task-work executed periodically, similarly to NUMA work,
which delays reclaim of cid values when they are unused for a period of
time.

Keep track of the allocation time for each per-cpu cid, and let the task
work clear them when they are observed to be older than
SCHED_MM_CID_PERIOD_NS and unused. This task work also clears all
mm_cids which are greater or equal to the Hamming weight of the mm
cidmask to keep concurrency ids compact.

Because we want to ensure the mm_cid converges towards the smaller
values as migrations happen, the prior optimization that was done when
context switching between threads belonging to the same mm is removed,
because it could delay the lazy release of the destination runqueue
mm_cid after it has been replaced by a migration. Removing this prior
optimization is not an issue performance-wise because the introduced
per-mm/per-cpu mm_cid tracking also covers this more specific case.

Fixes: af7f588d ("sched: Introduce per-memory-map concurrency ID")
Reported-by: default avatarAaron Lu <aaron.lu@intel.com>
Signed-off-by: default avatarMathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: default avatarAaron Lu <aaron.lu@intel.com>
Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
parent 5a4d3b38
......@@ -550,6 +550,13 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
#ifdef CONFIG_SCHED_MM_CID
struct mm_cid {
u64 time;
int cid;
};
#endif
struct kioctx_table;
struct mm_struct {
struct {
......@@ -600,15 +607,19 @@ struct mm_struct {
atomic_t mm_count;
#ifdef CONFIG_SCHED_MM_CID
/**
* @cid_lock: Protect cid bitmap updates vs lookups.
* @pcpu_cid: Per-cpu current cid.
*
* Prevent situations where updates to the cid bitmap happen
* concurrently with lookups. Those can lead to situations
* where a lookup cannot find a free bit simply because it was
* unlucky enough to load, non-atomically, bitmap words as they
* were being concurrently updated by the updaters.
* Keep track of the currently allocated mm_cid for each cpu.
* The per-cpu mm_cid values are serialized by their respective
* runqueue locks.
*/
raw_spinlock_t cid_lock;
struct mm_cid __percpu *pcpu_cid;
/*
* @mm_cid_next_scan: Next mm_cid scan (in jiffies).
*
* When the next mm_cid scan is due (in jiffies).
*/
unsigned long mm_cid_next_scan;
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
......@@ -873,6 +884,37 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
}
#ifdef CONFIG_SCHED_MM_CID
enum mm_cid_state {
MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
MM_CID_LAZY_PUT = (1U << 31),
};
static inline bool mm_cid_is_unset(int cid)
{
return cid == MM_CID_UNSET;
}
static inline bool mm_cid_is_lazy_put(int cid)
{
return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
}
static inline bool mm_cid_is_valid(int cid)
{
return !(cid & MM_CID_LAZY_PUT);
}
static inline int mm_cid_set_lazy_put(int cid)
{
return cid | MM_CID_LAZY_PUT;
}
static inline int mm_cid_clear_lazy_put(int cid)
{
return cid & ~MM_CID_LAZY_PUT;
}
/* Accessor for struct mm_struct's cidmask. */
static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
{
......@@ -886,16 +928,40 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
static inline void mm_init_cid(struct mm_struct *mm)
{
raw_spin_lock_init(&mm->cid_lock);
int i;
for_each_possible_cpu(i) {
struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
pcpu_cid->cid = MM_CID_UNSET;
pcpu_cid->time = 0;
}
cpumask_clear(mm_cidmask(mm));
}
static inline int mm_alloc_cid(struct mm_struct *mm)
{
mm->pcpu_cid = alloc_percpu(struct mm_cid);
if (!mm->pcpu_cid)
return -ENOMEM;
mm_init_cid(mm);
return 0;
}
static inline void mm_destroy_cid(struct mm_struct *mm)
{
free_percpu(mm->pcpu_cid);
mm->pcpu_cid = NULL;
}
static inline unsigned int mm_cid_size(void)
{
return cpumask_size();
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm) { }
static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
static inline void mm_destroy_cid(struct mm_struct *mm) { }
static inline unsigned int mm_cid_size(void)
{
return 0;
......
......@@ -1314,7 +1314,10 @@ struct task_struct {
#ifdef CONFIG_SCHED_MM_CID
int mm_cid; /* Current cid in mm */
int last_mm_cid; /* Most recent cid in mm */
int migrate_from_cpu;
int mm_cid_active; /* Whether cid bitmap is active */
struct callback_head cid_work;
#endif
struct tlbflush_unmap_batch tlb_ubc;
......
......@@ -37,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm)
atomic_inc(&mm->mm_count);
}
static inline void smp_mb__after_mmgrab(void)
{
smp_mb__after_atomic();
}
extern void __mmdrop(struct mm_struct *mm);
static inline void mmdrop(struct mm_struct *mm)
......
......@@ -793,6 +793,7 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
percpu_counter_destroy(&mm->rss_stat[i]);
......@@ -1057,7 +1058,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
tsk->migrate_from_cpu = -1;
#endif
return tsk;
......@@ -1162,18 +1165,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
if (mm_alloc_cid(mm))
goto fail_cid;
for (i = 0; i < NR_MM_COUNTERS; i++)
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
mm_init_cid(mm);
return mm;
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
mm_destroy_cid(mm);
fail_cid:
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
......
This diff is collapsed.
......@@ -3253,61 +3253,238 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
}
#ifdef CONFIG_SCHED_MM_CID
static inline int __mm_cid_get(struct mm_struct *mm)
#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
#define MM_CID_SCAN_DELAY 100 /* 100ms */
extern raw_spinlock_t cid_lock;
extern int use_cid_lock;
extern void sched_mm_cid_migrate_from(struct task_struct *t);
extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
extern void init_sched_mm_cid(struct task_struct *t);
static inline void __mm_cid_put(struct mm_struct *mm, int cid)
{
if (cid < 0)
return;
cpumask_clear_cpu(cid, mm_cidmask(mm));
}
/*
* The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
* the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
* be held to transition to other states.
*
* State transitions synchronized with cmpxchg or try_cmpxchg need to be
* consistent across cpus, which prevents use of this_cpu_cmpxchg.
*/
static inline void mm_cid_put_lazy(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
int cid;
lockdep_assert_irqs_disabled();
cid = __this_cpu_read(pcpu_cid->cid);
if (!mm_cid_is_lazy_put(cid) ||
!try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
return;
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
{
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
int cid, res;
lockdep_assert_irqs_disabled();
cid = __this_cpu_read(pcpu_cid->cid);
for (;;) {
if (mm_cid_is_unset(cid))
return MM_CID_UNSET;
/*
* Attempt transition from valid or lazy-put to unset.
*/
res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
if (res == cid)
break;
cid = res;
}
return cid;
}
static inline void mm_cid_put(struct mm_struct *mm)
{
int cid;
lockdep_assert_irqs_disabled();
cid = mm_cid_pcpu_unset(mm);
if (cid == MM_CID_UNSET)
return;
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
static inline int __mm_cid_try_get(struct mm_struct *mm)
{
struct cpumask *cpumask;
int cid;
cpumask = mm_cidmask(mm);
cid = cpumask_first_zero(cpumask);
if (cid >= nr_cpu_ids)
/*
* Retry finding first zero bit if the mask is temporarily
* filled. This only happens during concurrent remote-clear
* which owns a cid without holding a rq lock.
*/
for (;;) {
cid = cpumask_first_zero(cpumask);
if (cid < nr_cpu_ids)
break;
cpu_relax();
}
if (cpumask_test_and_set_cpu(cid, cpumask))
return -1;
__cpumask_set_cpu(cid, cpumask);
return cid;
}
static inline void mm_cid_put(struct mm_struct *mm, int cid)
/*
* Save a snapshot of the current runqueue time of this cpu
* with the per-cpu cid value, allowing to estimate how recently it was used.
*/
static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
{
lockdep_assert_irqs_disabled();
if (cid < 0)
return;
raw_spin_lock(&mm->cid_lock);
__cpumask_clear_cpu(cid, mm_cidmask(mm));
raw_spin_unlock(&mm->cid_lock);
struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
lockdep_assert_rq_held(rq);
WRITE_ONCE(pcpu_cid->time, rq->clock);
}
static inline int mm_cid_get(struct mm_struct *mm)
static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
int ret;
int cid;
lockdep_assert_irqs_disabled();
raw_spin_lock(&mm->cid_lock);
ret = __mm_cid_get(mm);
raw_spin_unlock(&mm->cid_lock);
return ret;
/*
* All allocations (even those using the cid_lock) are lock-free. If
* use_cid_lock is set, hold the cid_lock to perform cid allocation to
* guarantee forward progress.
*/
if (!READ_ONCE(use_cid_lock)) {
cid = __mm_cid_try_get(mm);
if (cid >= 0)
goto end;
raw_spin_lock(&cid_lock);
} else {
raw_spin_lock(&cid_lock);
cid = __mm_cid_try_get(mm);
if (cid >= 0)
goto unlock;
}
/*
* cid concurrently allocated. Retry while forcing following
* allocations to use the cid_lock to ensure forward progress.
*/
WRITE_ONCE(use_cid_lock, 1);
/*
* Set use_cid_lock before allocation. Only care about program order
* because this is only required for forward progress.
*/
barrier();
/*
* Retry until it succeeds. It is guaranteed to eventually succeed once
* all newcoming allocations observe the use_cid_lock flag set.
*/
do {
cid = __mm_cid_try_get(mm);
cpu_relax();
} while (cid < 0);
/*
* Allocate before clearing use_cid_lock. Only care about
* program order because this is for forward progress.
*/
barrier();
WRITE_ONCE(use_cid_lock, 0);
unlock:
raw_spin_unlock(&cid_lock);
end:
mm_cid_snapshot_time(rq, mm);
return cid;
}
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
struct cpumask *cpumask;
int cid;
lockdep_assert_rq_held(rq);
cpumask = mm_cidmask(mm);
cid = __this_cpu_read(pcpu_cid->cid);
if (mm_cid_is_valid(cid)) {
mm_cid_snapshot_time(rq, mm);
return cid;
}
if (mm_cid_is_lazy_put(cid)) {
if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
cid = __mm_cid_get(rq, mm);
__this_cpu_write(pcpu_cid->cid, cid);
return cid;
}
static inline void switch_mm_cid(struct rq *rq,
struct task_struct *prev,
struct task_struct *next)
{
/*
* Provide a memory barrier between rq->curr store and load of
* {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
*
* Should be adapted if context_switch() is modified.
*/
if (!next->mm) { // to kernel
/*
* user -> kernel transition does not guarantee a barrier, but
* we can use the fact that it performs an atomic operation in
* mmgrab().
*/
if (prev->mm) // from user
smp_mb__after_mmgrab();
/*
* kernel -> kernel transition does not change rq->curr->mm
* state. It stays NULL.
*/
} else { // to user
/*
* kernel -> user transition does not provide a barrier
* between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
* Provide it here.
*/
if (!prev->mm) // from kernel
smp_mb();
/*
* user -> user transition guarantees a memory barrier through
* switch_mm() when current->mm changes. If current->mm is
* unchanged, no barrier is needed.
*/
}
if (prev->mm_cid_active) {
if (next->mm_cid_active && next->mm == prev->mm) {
/*
* Context switch between threads in same mm, hand over
* the mm_cid from prev to next.
*/
next->mm_cid = prev->mm_cid;
prev->mm_cid = -1;
return;
}
mm_cid_put(prev->mm, prev->mm_cid);
mm_cid_snapshot_time(rq, prev->mm);
mm_cid_put_lazy(prev);
prev->mm_cid = -1;
}
if (next->mm_cid_active)
next->mm_cid = mm_cid_get(next->mm);
next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
}
#else
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif
#endif /* _KERNEL_SCHED_SCHED_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment