Commit 24af98c4 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "So we have a laundry list of locking subsystem changes:

   - continuing barrier API and code improvements

   - futex enhancements

   - atomics API improvements

   - pvqspinlock enhancements: in particular lock stealing and adaptive
     spinning

   - qspinlock micro-enhancements"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op
  futex: Cleanup the goto confusion in requeue_pi()
  futex: Remove pointless put_pi_state calls in requeue()
  futex: Document pi_state refcounting in requeue code
  futex: Rename free_pi_state() to put_pi_state()
  futex: Drop refcount if requeue_pi() acquired the rtmutex
  locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation
  lcoking/barriers, arch: Use smp barriers in smp_store_release()
  locking/cmpxchg, arch: Remove tas() definitions
  locking/pvqspinlock: Queue node adaptive spinning
  locking/pvqspinlock: Allow limited lock stealing
  locking/pvqspinlock: Collect slowpath lock statistics
  sched/core, locking: Document Program-Order guarantees
  locking, sched: Introduce smp_cond_acquire() and use it
  locking/pvqspinlock, x86: Optimize the PV unlock code path
  locking/qspinlock: Avoid redundant read of next pointer
  locking/qspinlock: Prefetch the next node cacheline
  locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg()
  atomics: Add test for atomic operations with _relaxed variants
parents 9061cbe6 337f1304
...@@ -1673,8 +1673,8 @@ There are some more advanced barrier functions: ...@@ -1673,8 +1673,8 @@ There are some more advanced barrier functions:
(*) smp_store_mb(var, value) (*) smp_store_mb(var, value)
This assigns the value to the variable and then inserts a full memory This assigns the value to the variable and then inserts a full memory
barrier after it, depending on the function. It isn't guaranteed to barrier after it. It isn't guaranteed to insert anything more than a
insert anything more than a compiler barrier in a UP compilation. compiler barrier in a UP compilation.
(*) smp_mb__before_atomic(); (*) smp_mb__before_atomic();
......
...@@ -128,6 +128,5 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, ...@@ -128,6 +128,5 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
#endif /* !CONFIG_SMP */ #endif /* !CONFIG_SMP */
#define xchg(ptr, x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) #define xchg(ptr, x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
#define tas(ptr) ((void)xchg((ptr), 1))
#endif /* __ARCH_BLACKFIN_CMPXCHG__ */ #endif /* __ARCH_BLACKFIN_CMPXCHG__ */
...@@ -47,8 +47,6 @@ static inline unsigned int __xchg(unsigned int x, volatile void *ptr, int size) ...@@ -47,8 +47,6 @@ static inline unsigned int __xchg(unsigned int x, volatile void *ptr, int size)
#define xchg(ptr, x) \ #define xchg(ptr, x) \
((__typeof__(*(ptr)))__xchg((unsigned int)(x), (void *) (ptr), \ ((__typeof__(*(ptr)))__xchg((unsigned int)(x), (void *) (ptr), \
sizeof(*(ptr)))) sizeof(*(ptr))))
#define tas(ptr) xchg((ptr), 1)
#include <asm-generic/cmpxchg-local.h> #include <asm-generic/cmpxchg-local.h>
......
...@@ -69,8 +69,6 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v); ...@@ -69,8 +69,6 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v);
#endif #endif
#define tas(ptr) (xchg((ptr), 1))
/*****************************************************************************/ /*****************************************************************************/
/* /*
* compare and conditionally exchange value with memory * compare and conditionally exchange value with memory
......
...@@ -77,7 +77,7 @@ do { \ ...@@ -77,7 +77,7 @@ do { \
___p1; \ ___p1; \
}) })
#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
/* /*
* The group barrier in front of the rsm & ssm are necessary to ensure * The group barrier in front of the rsm & ssm are necessary to ensure
......
...@@ -34,7 +34,7 @@ ...@@ -34,7 +34,7 @@
#define rmb() __asm__ __volatile__ ("sync" : : : "memory") #define rmb() __asm__ __volatile__ ("sync" : : : "memory")
#define wmb() __asm__ __volatile__ ("sync" : : : "memory") #define wmb() __asm__ __volatile__ ("sync" : : : "memory")
#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
#ifdef __SUBARCH_HAS_LWSYNC #ifdef __SUBARCH_HAS_LWSYNC
# define SMPWMB LWSYNC # define SMPWMB LWSYNC
......
...@@ -36,7 +36,7 @@ ...@@ -36,7 +36,7 @@
#define smp_mb__before_atomic() smp_mb() #define smp_mb__before_atomic() smp_mb()
#define smp_mb__after_atomic() smp_mb() #define smp_mb__after_atomic() smp_mb()
#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
#define smp_store_release(p, v) \ #define smp_store_release(p, v) \
do { \ do { \
......
...@@ -127,8 +127,6 @@ long long _atomic64_cmpxchg(long long *v, long long o, long long n); ...@@ -127,8 +127,6 @@ long long _atomic64_cmpxchg(long long *v, long long o, long long n);
#endif #endif
#define tas(ptr) xchg((ptr), 1)
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
#endif /* _ASM_TILE_CMPXCHG_H */ #endif /* _ASM_TILE_CMPXCHG_H */
...@@ -687,6 +687,14 @@ config PARAVIRT_SPINLOCKS ...@@ -687,6 +687,14 @@ config PARAVIRT_SPINLOCKS
If you are unsure how to answer this question, answer Y. If you are unsure how to answer this question, answer Y.
config QUEUED_LOCK_STAT
bool "Paravirt queued spinlock statistics"
depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
---help---
Enable the collection of statistical data on the slowpath
behavior of paravirtualized queued spinlocks and report
them on debugfs.
source "arch/x86/xen/Kconfig" source "arch/x86/xen/Kconfig"
config KVM_GUEST config KVM_GUEST
......
#ifndef __ASM_QSPINLOCK_PARAVIRT_H #ifndef __ASM_QSPINLOCK_PARAVIRT_H
#define __ASM_QSPINLOCK_PARAVIRT_H #define __ASM_QSPINLOCK_PARAVIRT_H
/*
* For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
* registers. For i386, however, only 1 32-bit register needs to be saved
* and restored. So an optimized version of __pv_queued_spin_unlock() is
* hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
*/
#ifdef CONFIG_64BIT
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
#define __pv_queued_spin_unlock __pv_queued_spin_unlock
#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
/*
 * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
 * which combines the register-saving thunk and the body of the following
 * C code:
 *
 * void __pv_queued_spin_unlock(struct qspinlock *lock)
 * {
 *	struct __qspinlock *l = (void *)lock;
 *	u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
 *
 *	if (likely(lockval == _Q_LOCKED_VAL))
 *		return;
 *	pv_queued_spin_unlock_slowpath(lock, lockval);
 * }
 *
 * For x86-64,
 *   rdi = lock              (first argument)
 *   rsi = lockval           (second argument)
 *   rdx = internal variable (set to 0)
 *
 * rdx is pushed on entry and popped before every return; rsi is pushed and
 * popped only around the slowpath call. eax is overwritten and not restored.
 */
asm (".pushsection .text;"
".globl " PV_UNLOCK ";"
".align 4,0x90;"
PV_UNLOCK ": "
/* Fast path: save rdx, then set up the byte cmpxchg operands. */
"push %rdx;"
/* al = value the lock byte is compared against (0x1). */
"mov $0x1,%eax;"
/* dl = value to store on a match (0). */
"xor %edx,%edx;"
/* Atomically replace the byte at (%rdi) with dl if it equals al. */
"lock cmpxchg %dl,(%rdi);"
/* al now holds the old lock byte; anything but 0x1 takes the slow path. */
"cmp $0x1,%al;"
"jne .slowpath;"
"pop %rdx;"
"ret;"
".slowpath: "
/* Slow path: rsi becomes the second argument, so preserve it first. */
"push %rsi;"
/* esi = zero-extended old lock byte (lockval). */
"movzbl %al,%esi;"
"call " PV_UNLOCK_SLOWPATH ";"
/* Restore in reverse order of the pushes. */
"pop %rsi;"
"pop %rdx;"
"ret;"
".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
".popsection");
#else /* CONFIG_64BIT */
extern void __pv_queued_spin_unlock(struct qspinlock *lock);
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
#endif /* CONFIG_64BIT */
#endif #endif
...@@ -93,7 +93,7 @@ ...@@ -93,7 +93,7 @@
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#ifndef smp_store_mb #ifndef smp_store_mb
#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) #define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
#endif #endif
#ifndef smp_mb__before_atomic #ifndef smp_mb__before_atomic
......
...@@ -12,8 +12,9 @@ ...@@ -12,8 +12,9 @@
* GNU General Public License for more details. * GNU General Public License for more details.
* *
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
* *
* Authors: Waiman Long <waiman.long@hp.com> * Authors: Waiman Long <waiman.long@hpe.com>
*/ */
#ifndef __ASM_GENERIC_QSPINLOCK_H #ifndef __ASM_GENERIC_QSPINLOCK_H
#define __ASM_GENERIC_QSPINLOCK_H #define __ASM_GENERIC_QSPINLOCK_H
...@@ -62,7 +63,7 @@ static __always_inline int queued_spin_is_contended(struct qspinlock *lock) ...@@ -62,7 +63,7 @@ static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
static __always_inline int queued_spin_trylock(struct qspinlock *lock) static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{ {
if (!atomic_read(&lock->val) && if (!atomic_read(&lock->val) &&
(atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0)) (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0))
return 1; return 1;
return 0; return 0;
} }
...@@ -77,7 +78,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock) ...@@ -77,7 +78,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
{ {
u32 val; u32 val;
val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
if (likely(val == 0)) if (likely(val == 0))
return; return;
queued_spin_lock_slowpath(lock, val); queued_spin_lock_slowpath(lock, val);
...@@ -93,7 +94,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock) ...@@ -93,7 +94,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
/* /*
* smp_mb__before_atomic() in order to guarantee release semantics * smp_mb__before_atomic() in order to guarantee release semantics
*/ */
smp_mb__before_atomic_dec(); smp_mb__before_atomic();
atomic_sub(_Q_LOCKED_VAL, &lock->val); atomic_sub(_Q_LOCKED_VAL, &lock->val);
} }
#endif #endif
......
...@@ -299,6 +299,23 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s ...@@ -299,6 +299,23 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
__u.__val; \ __u.__val; \
}) })
/**
 * smp_cond_acquire() - Busy-wait until @cond holds, with ACQUIRE ordering
 * @cond: boolean expression to poll; it is re-evaluated on every iteration
 *
 * Behaves like polling the condition variable with smp_load_acquire(), but
 * relies on the control dependency of the wait loop so that a cheaper
 * barrier suffices on many platforms.
 *
 * The control dependency orders the LOAD against later STOREs and the
 * trailing smp_rmb() orders it against later LOADs; combined they give
 * LOAD->{LOAD,STORE} ordering, i.e. ACQUIRE.
 */
#define smp_cond_acquire(cond)	do {				\
	for (;;) {						\
		if (cond)					\
			break;					\
		cpu_relax();					\
	}							\
	smp_rmb(); /* ctrl + rmb := acquire */			\
} while (0)
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
......
...@@ -725,9 +725,12 @@ static struct futex_pi_state * alloc_pi_state(void) ...@@ -725,9 +725,12 @@ static struct futex_pi_state * alloc_pi_state(void)
} }
/* /*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
*
* Must be called with the hb lock held. * Must be called with the hb lock held.
*/ */
static void free_pi_state(struct futex_pi_state *pi_state) static void put_pi_state(struct futex_pi_state *pi_state)
{ {
if (!pi_state) if (!pi_state)
return; return;
...@@ -1706,31 +1709,35 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ...@@ -1706,31 +1709,35 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
* exist yet, look it up one more time to ensure we have a * exist yet, look it up one more time to ensure we have a
* reference to it. If the lock was taken, ret contains the * reference to it. If the lock was taken, ret contains the
* vpid of the top waiter task. * vpid of the top waiter task.
* If the lock was not taken, we have pi_state and an initial
* refcount on it. In case of an error we have nothing.
*/ */
if (ret > 0) { if (ret > 0) {
WARN_ON(pi_state); WARN_ON(pi_state);
drop_count++; drop_count++;
task_count++; task_count++;
/* /*
* If we acquired the lock, then the user * If we acquired the lock, then the user space value
* space value of uaddr2 should be vpid. It * of uaddr2 should be vpid. It cannot be changed by
* cannot be changed by the top waiter as it * the top waiter as it is blocked on hb2 lock if it
* is blocked on hb2 lock if it tries to do * tries to do so. If something fiddled with it behind
* so. If something fiddled with it behind our * our back the pi state lookup might unearth it. So
* back the pi state lookup might unearth * we rather use the known value than rereading and
* it. So we rather use the known value than * handing potential crap to lookup_pi_state.
* rereading and handing potential crap to *
* lookup_pi_state. * If that call succeeds then we have pi_state and an
* initial refcount on it.
*/ */
ret = lookup_pi_state(ret, hb2, &key2, &pi_state); ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
} }
switch (ret) { switch (ret) {
case 0: case 0:
/* We hold a reference on the pi state. */
break; break;
/* If the above failed, then pi_state is NULL */
case -EFAULT: case -EFAULT:
free_pi_state(pi_state);
pi_state = NULL;
double_unlock_hb(hb1, hb2); double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2); hb_waiters_dec(hb2);
put_futex_key(&key2); put_futex_key(&key2);
...@@ -1746,8 +1753,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ...@@ -1746,8 +1753,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
* exit to complete. * exit to complete.
* - The user space value changed. * - The user space value changed.
*/ */
free_pi_state(pi_state);
pi_state = NULL;
double_unlock_hb(hb1, hb2); double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2); hb_waiters_dec(hb2);
put_futex_key(&key2); put_futex_key(&key2);
...@@ -1801,30 +1806,58 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, ...@@ -1801,30 +1806,58 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
* of requeue_pi if we couldn't acquire the lock atomically. * of requeue_pi if we couldn't acquire the lock atomically.
*/ */
if (requeue_pi) { if (requeue_pi) {
/* Prepare the waiter to take the rt_mutex. */ /*
* Prepare the waiter to take the rt_mutex. Take a
* refcount on the pi_state and store the pointer in
* the futex_q object of the waiter.
*/
atomic_inc(&pi_state->refcount); atomic_inc(&pi_state->refcount);
this->pi_state = pi_state; this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter, this->rt_waiter,
this->task); this->task);
if (ret == 1) { if (ret == 1) {
/* We got the lock. */ /*
* We got the lock. We do neither drop the
* refcount on pi_state nor clear
* this->pi_state because the waiter needs the
* pi_state for cleaning up the user space
* value. It will drop the refcount after
* doing so.
*/
requeue_pi_wake_futex(this, &key2, hb2); requeue_pi_wake_futex(this, &key2, hb2);
drop_count++; drop_count++;
continue; continue;
} else if (ret) { } else if (ret) {
/* -EDEADLK */ /*
* rt_mutex_start_proxy_lock() detected a
* potential deadlock when we tried to queue
* that waiter. Drop the pi_state reference
* which we took above and remove the pointer
* to the state from the waiters futex_q
* object.
*/
this->pi_state = NULL; this->pi_state = NULL;
free_pi_state(pi_state); put_pi_state(pi_state);
goto out_unlock; /*
* We stop queueing more waiters and let user
* space deal with the mess.
*/
break;
} }
} }
requeue_futex(this, hb1, hb2, &key2); requeue_futex(this, hb1, hb2, &key2);
drop_count++; drop_count++;
} }
/*
* We took an extra initial reference to the pi_state either
* in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
* need to drop it here again.
*/
put_pi_state(pi_state);
out_unlock: out_unlock:
free_pi_state(pi_state);
double_unlock_hb(hb1, hb2); double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q); wake_up_q(&wake_q);
hb_waiters_dec(hb2); hb_waiters_dec(hb2);
...@@ -1973,7 +2006,7 @@ static void unqueue_me_pi(struct futex_q *q) ...@@ -1973,7 +2006,7 @@ static void unqueue_me_pi(struct futex_q *q)
__unqueue_futex(q); __unqueue_futex(q);
BUG_ON(!q->pi_state); BUG_ON(!q->pi_state);
free_pi_state(q->pi_state); put_pi_state(q->pi_state);
q->pi_state = NULL; q->pi_state = NULL;
spin_unlock(q->lock_ptr); spin_unlock(q->lock_ptr);
...@@ -2755,6 +2788,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ...@@ -2755,6 +2788,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) { if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr); spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current); ret = fixup_pi_state_owner(uaddr2, &q, current);
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
*/
put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr); spin_unlock(q.lock_ptr);
} }
} else { } else {
...@@ -3046,7 +3084,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ...@@ -3046,7 +3084,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (op & FUTEX_CLOCK_REALTIME) { if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT; flags |= FLAGS_CLOCKRT;
if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS; return -ENOSYS;
} }
......
...@@ -14,8 +14,9 @@ ...@@ -14,8 +14,9 @@
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014 Red Hat, Inc. * (C) Copyright 2013-2014 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp. * (C) Copyright 2015 Intel Corp.
* (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
* *
* Authors: Waiman Long <waiman.long@hp.com> * Authors: Waiman Long <waiman.long@hpe.com>
* Peter Zijlstra <peterz@infradead.org> * Peter Zijlstra <peterz@infradead.org>
*/ */
...@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) ...@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{ {
struct __qspinlock *l = (void *)lock; struct __qspinlock *l = (void *)lock;
return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; /*
* Use release semantics to make sure that the MCS node is properly
* initialized before changing the tail code.
*/
return (u32)xchg_release(&l->tail,
tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
} }
#else /* _Q_PENDING_BITS == 8 */ #else /* _Q_PENDING_BITS == 8 */
...@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) ...@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) { for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail; new = (val & _Q_LOCKED_PENDING_MASK) | tail;
old = atomic_cmpxchg(&lock->val, val, new); /*
* Use release semantics to make sure that the MCS node is
* properly initialized before changing the tail code.
*/
old = atomic_cmpxchg_release(&lock->val, val, new);
if (old == val) if (old == val)
break; break;
...@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock) ...@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/ */
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock, static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { } struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_head(struct qspinlock *lock, static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
struct mcs_spinlock *node) { } struct mcs_spinlock *node)
{ return 0; }
#define pv_enabled() false #define pv_enabled() false
#define pv_init_node __pv_init_node #define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node #define pv_wait_node __pv_wait_node
#define pv_kick_node __pv_kick_node #define pv_kick_node __pv_kick_node
#define pv_wait_head __pv_wait_head #define pv_wait_head_or_lock __pv_wait_head_or_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS #ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
...@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) ...@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (val == new) if (val == new)
new |= _Q_PENDING_VAL; new |= _Q_PENDING_VAL;
old = atomic_cmpxchg(&lock->val, val, new); /*
* Acquire semantic is required here as the function may
* return immediately if the lock was free.
*/
old = atomic_cmpxchg_acquire(&lock->val, val, new);
if (old == val) if (old == val)
break; break;
...@@ -382,6 +398,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) ...@@ -382,6 +398,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* p,*,* -> n,*,* * p,*,* -> n,*,*
*/ */
old = xchg_tail(lock, tail); old = xchg_tail(lock, tail);
next = NULL;
/* /*
* if there was a previous node; link it and wait until reaching the * if there was a previous node; link it and wait until reaching the
...@@ -391,8 +408,18 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) ...@@ -391,8 +408,18 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
prev = decode_tail(old); prev = decode_tail(old);
WRITE_ONCE(prev->next, node); WRITE_ONCE(prev->next, node);
pv_wait_node(node); pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked); arch_mcs_spin_lock_contended(&node->locked);
/*
* While waiting for the MCS lock, the next pointer may have
* been set by another lock waiter. We optimistically load
* the next pointer & prefetch the cacheline for writing
* to reduce latency in the upcoming MCS unlock operation.
*/
next = READ_ONCE(node->next);
if (next)
prefetchw(next);
} }
/* /*
...@@ -406,11 +433,22 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) ...@@ -406,11 +433,22 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* sequentiality; this is because the set_locked() function below * sequentiality; this is because the set_locked() function below
* does not imply a full barrier. * does not imply a full barrier.
* *
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
* smp_cond_acquire() call. As the next PV queue head hasn't been
* designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
*
* If PV isn't active, 0 will be returned instead.
*
*/ */
pv_wait_head(lock, node); if ((val = pv_wait_head_or_lock(lock, node)))
while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) goto locked;
cpu_relax();
smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
locked:
/* /*
* claim the lock: * claim the lock:
* *
...@@ -422,11 +460,17 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) ...@@ -422,11 +460,17 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* to grab the lock. * to grab the lock.
*/ */
for (;;) { for (;;) {
if (val != tail) { /* In the PV case we might already have _Q_LOCKED_VAL set */
if ((val & _Q_TAIL_MASK) != tail) {
set_locked(lock); set_locked(lock);
break; break;
} }
old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); /*
* The smp_cond_acquire() call above has provided the necessary
* acquire semantics required for locking. At most two
* iterations of this loop may be run.
*/
old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val) if (old == val)
goto release; /* No contention */ goto release; /* No contention */
...@@ -434,10 +478,12 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) ...@@ -434,10 +478,12 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
} }
/* /*
* contended path; wait for next, release. * contended path; wait for next if not observed yet, release.
*/ */
if (!next) {
while (!(next = READ_ONCE(node->next))) while (!(next = READ_ONCE(node->next)))
cpu_relax(); cpu_relax();
}
arch_mcs_spin_unlock_contended(&next->locked); arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next); pv_kick_node(lock, next);
...@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); ...@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#undef pv_init_node #undef pv_init_node
#undef pv_wait_node #undef pv_wait_node
#undef pv_kick_node #undef pv_kick_node
#undef pv_wait_head #undef pv_wait_head_or_lock
#undef queued_spin_lock_slowpath #undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
......
...@@ -22,6 +22,20 @@ ...@@ -22,6 +22,20 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
/*
* Queue Node Adaptive Spinning
*
* A queue node vCPU will stop spinning if the vCPU in the previous node is
* not running. The one lock stealing attempt allowed at slowpath entry
* mitigates the slight slowdown for non-overcommitted guests with this
* aggressive wait-early mechanism.
*
* The status of the previous node will be checked at a fixed interval
* controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
* pound on the cacheline of the previous node too heavily.
*/
#define PV_PREV_CHECK_MASK 0xff
/* /*
* Queue node uses: vcpu_running & vcpu_halted. * Queue node uses: vcpu_running & vcpu_halted.
* Queue head uses: vcpu_running & vcpu_hashed. * Queue head uses: vcpu_running & vcpu_hashed.
...@@ -40,6 +54,94 @@ struct pv_node { ...@@ -40,6 +54,94 @@ struct pv_node {
u8 state; u8 state;
}; };
/*
* By replacing the regular queued_spin_trylock() with the function below,
* it will be called once when a lock waiter enters the PV slowpath before
* being queued. By allowing one lock stealing attempt here when the pending
* bit is off, it helps to reduce the performance impact of lock waiter
* preemption without the drawback of lock starvation.
*/
#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
}
/*
* The pending bit is used by the queue head vCPU to indicate that it
* is actively spinning on the lock and no lock stealing is allowed.
*/
#if _Q_PENDING_BITS == 8
/* Store 1 to the ->pending field of the lock word. */
static __always_inline void set_pending(struct qspinlock *lock)
{
	struct __qspinlock *qlock = (struct __qspinlock *)lock;

	WRITE_ONCE(qlock->pending, 1);
}
/* Store 0 to the ->pending field of the lock word. */
static __always_inline void clear_pending(struct qspinlock *lock)
{
	struct __qspinlock *qlock = (struct __qspinlock *)lock;

	WRITE_ONCE(qlock->pending, 0);
}
/*
 * Try to take the lock and drop the pending bit in a single step.
 *
 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
 * barrier, so an atomic cmpxchg() on ->locked_pending is used to make sure
 * the lock is really obtained.
 *
 * Returns 1 if ->locked_pending was switched from _Q_PENDING_VAL to
 * _Q_LOCKED_VAL, 0 otherwise.
 */
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	struct __qspinlock *qlock = (void *)lock;

	/* Skip the atomic operation when ->locked is already seen set. */
	if (READ_ONCE(qlock->locked))
		return 0;

	return cmpxchg(&qlock->locked_pending, _Q_PENDING_VAL,
		       _Q_LOCKED_VAL) == _Q_PENDING_VAL;
}
#else /* _Q_PENDING_BITS == 8 */
/*
 * Atomically set the pending bit (_Q_PENDING_VAL) in the lock word.
 *
 * atomic_or() is called directly instead of the deprecated
 * atomic_set_mask() wrapper.
 */
static __always_inline void set_pending(struct qspinlock *lock)
{
	atomic_or(_Q_PENDING_VAL, &lock->val);
}
/*
 * Atomically clear the pending bit (_Q_PENDING_VAL) in the lock word.
 *
 * atomic_andnot() is called directly instead of the deprecated
 * atomic_clear_mask() wrapper.
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
	atomic_andnot(_Q_PENDING_VAL, &lock->val);
}
/*
 * Try to take the lock and drop the pending bit with one atomic_cmpxchg()
 * on the whole lock word.
 *
 * Returns 1 once a cmpxchg succeeds, or 0 as soon as a _Q_LOCKED_MASK bit
 * is observed set in the lock word.
 */
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	int cur = atomic_read(&lock->val);

	while (!(cur & _Q_LOCKED_MASK)) {
		int expected = cur;
		/* Same word with the pending bit cleared and the locked bit set. */
		int desired = (cur & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;

		cur = atomic_cmpxchg(&lock->val, expected, desired);
		if (cur == expected)
			return 1;
		/* Lost a race: retry with the value cmpxchg just returned. */
	}

	return 0;
}
#endif /* _Q_PENDING_BITS == 8 */
/*
* Include queued spinlock statistics code
*/
#include "qspinlock_stat.h"
/* /*
* Lock and MCS node addresses hash table for fast lookup * Lock and MCS node addresses hash table for fast lookup
* *
...@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) ...@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{ {
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he; struct pv_hash_entry *he;
int hopcnt = 0;
for_each_hash_entry(he, offset, hash) { for_each_hash_entry(he, offset, hash) {
hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) { if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node); WRITE_ONCE(he->node, node);
qstat_hop(hopcnt);
return &he->lock; return &he->lock;
} }
} }
...@@ -143,6 +248,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) ...@@ -143,6 +248,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
BUG(); BUG();
} }
/*
 * Return true when it is time to check the previous node and that node is
 * not in the vcpu_running state.
 *
 * The previous node's state is read only when the bits of @loop selected by
 * PV_PREV_CHECK_MASK are all zero; for any other @loop value the function
 * returns false without touching @prev.
 */
static inline bool
pv_wait_early(struct pv_node *prev, int loop)
{
	bool check_now = !(loop & PV_PREV_CHECK_MASK);

	return check_now && READ_ONCE(prev->state) != vcpu_running;
}
/* /*
* Initialize the PV part of the mcs_spinlock node. * Initialize the PV part of the mcs_spinlock node.
*/ */
...@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node) ...@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf. * behalf.
*/ */
static void pv_wait_node(struct mcs_spinlock *node) static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{ {
struct pv_node *pn = (struct pv_node *)node; struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
int waitcnt = 0;
int loop; int loop;
bool wait_early;
for (;;) { /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
for (loop = SPIN_THRESHOLD; loop; loop--) { for (;; waitcnt++) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked)) if (READ_ONCE(node->locked))
return; return;
if (pv_wait_early(pp, loop)) {
wait_early = true;
break;
}
cpu_relax(); cpu_relax();
} }
...@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node) ...@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
*/ */
smp_store_mb(pn->state, vcpu_halted); smp_store_mb(pn->state, vcpu_halted);
if (!READ_ONCE(node->locked)) if (!READ_ONCE(node->locked)) {
qstat_inc(qstat_pv_wait_node, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted); pv_wait(&pn->state, vcpu_halted);
}
/* /*
* If pv_kick_node() changed us to vcpu_hashed, retain that value * If pv_kick_node() changed us to vcpu_hashed, retain that
* so that pv_wait_head() knows to not also try to hash this lock. * value so that pv_wait_head_or_lock() knows to not also try
* to hash this lock.
*/ */
cmpxchg(&pn->state, vcpu_halted, vcpu_running); cmpxchg(&pn->state, vcpu_halted, vcpu_running);
...@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node) ...@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* So it is better to spin for a while in the hope that the * So it is better to spin for a while in the hope that the
* MCS lock will be released soon. * MCS lock will be released soon.
*/ */
qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
} }
/* /*
...@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node) ...@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
/* /*
* Called after setting next->locked = 1 when we're the lock owner. * Called after setting next->locked = 1 when we're the lock owner.
* *
* Instead of waking the waiters stuck in pv_wait_node() advance their state such * Instead of waking the waiters stuck in pv_wait_node() advance their state
* that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. * such that they're waiting in pv_wait_head_or_lock(), this avoids a
* wake/sleep cycle.
*/ */
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{ {
...@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) ...@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
} }
/* /*
* Wait for l->locked to become clear; halt the vcpu after a short spin. * Wait for l->locked to become clear and acquire the lock;
* halt the vcpu after a short spin.
* __pv_queued_spin_unlock() will wake us. * __pv_queued_spin_unlock() will wake us.
*
* The current value of the lock will be returned for additional processing.
*/ */
static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{ {
struct pv_node *pn = (struct pv_node *)node; struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock; struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL; struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop; int loop;
/* /*
...@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) ...@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
if (READ_ONCE(pn->state) == vcpu_hashed) if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1; lp = (struct qspinlock **)1;
for (;;) { for (;; waitcnt++) {
/*
* Set correct vCPU state to be used by queue node wait-early
* mechanism.
*/
WRITE_ONCE(pn->state, vcpu_running);
/*
* Set the pending bit in the active lock spinning loop to
* disable lock stealing before attempting to acquire the lock.
*/
set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) { for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked)) if (trylock_clear_pending(lock))
return; goto gotlock;
cpu_relax(); cpu_relax();
} }
clear_pending(lock);
if (!lp) { /* ONCE */ if (!lp) { /* ONCE */
lp = pv_hash(lock, pn); lp = pv_hash(lock, pn);
...@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) ...@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
* *
* Matches the smp_rmb() in __pv_queued_spin_unlock(). * Matches the smp_rmb() in __pv_queued_spin_unlock().
*/ */
if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
/* /*
* The lock is free and _Q_SLOW_VAL has never * The lock was free and now we own the lock.
* been set. Therefore we need to unhash before * Change the lock value back to _Q_LOCKED_VAL
* getting the lock. * and unhash the table.
*/ */
WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL); WRITE_ONCE(*lp, NULL);
return; goto gotlock;
} }
} }
WRITE_ONCE(pn->state, vcpu_halted);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL); pv_wait(&l->locked, _Q_SLOW_VAL);
/* /*
* The unlocker should have freed the lock before kicking the * The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious * CPU. So if the lock is still not free, it is a spurious
* wakeup and so the vCPU should wait again after spinning for * wakeup or another vCPU has stolen the lock. The current
* a while. * vCPU should spin again.
*/ */
qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
} }
/* /*
* Lock is unlocked now; the caller will acquire it without waiting. * The cmpxchg() or xchg() call before coming here provides the
* As with pv_wait_node() we rely on the caller to do a load-acquire * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
* for us. * here is to indicate to the compiler that the value will always
* be nonzero to enable better code optimization.
*/ */
gotlock:
return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
} }
/* /*
* PV version of the unlock function to be used in stead of * PV versions of the unlock fastpath and slowpath functions to be used
* queued_spin_unlock(). * instead of queued_spin_unlock().
*/ */
__visible void __pv_queued_spin_unlock(struct qspinlock *lock) __visible void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{ {
struct __qspinlock *l = (void *)lock; struct __qspinlock *l = (void *)lock;
struct pv_node *node; struct pv_node *node;
u8 locked;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
if (likely(locked == _Q_LOCKED_VAL))
return;
if (unlikely(locked != _Q_SLOW_VAL)) { if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent, WARN(!debug_locks_silent,
...@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) ...@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* so we need a barrier to order the read of the node data in * so we need a barrier to order the read of the node data in
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL. * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
* *
* Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
*/ */
smp_rmb(); smp_rmb();
...@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) ...@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* vCPU is harmless other than the additional latency in completing * vCPU is harmless other than the additional latency in completing
* the unlock. * the unlock.
*/ */
qstat_inc(qstat_pv_kick_unlock, true);
pv_kick(node->cpu); pv_kick(node->cpu);
} }
/* /*
* Include the architecture specific callee-save thunk of the * Include the architecture specific callee-save thunk of the
* __pv_queued_spin_unlock(). This thunk is put together with * __pv_queued_spin_unlock(). This thunk is put together with
* __pv_queued_spin_unlock() near the top of the file to make sure * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
* that the callee-save thunk and the real unlock function are close * function close to each other sharing consecutive instruction cachelines.
* to each other sharing consecutive instruction cachelines. * Alternatively, architecture specific version of __pv_queued_spin_unlock()
* can be defined.
*/ */
#include <asm/qspinlock_paravirt.h> #include <asm/qspinlock_paravirt.h>
#ifndef __pv_queued_spin_unlock
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;
	u8 old;

	/*
	 * Fast path: release the lock only if the locked byte is exactly
	 * _Q_LOCKED_VAL.  We must not unlock if SLOW, because in that case
	 * we must first unhash; otherwise it would be possible to have
	 * multiple @lock entries, which would be BAD.  Any other observed
	 * value is therefore handed to the slowpath.
	 */
	old = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
	if (unlikely(old != _Q_LOCKED_VAL))
		__pv_queued_spin_unlock_slowpath(lock, old);
}
#endif /* __pv_queued_spin_unlock */
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* Authors: Waiman Long <waiman.long@hpe.com>
*/
/*
* When queued spinlock statistical counters are enabled, the following
* debugfs files will be created for reporting the counter values:
*
* <debugfs>/qlockstat/
* pv_hash_hops - average # of hops per hashing operation
* pv_kick_unlock - # of vCPU kicks issued at unlock time
* pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
* pv_latency_kick - average latency (ns) of vCPU kick operation
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
* pv_lock_stealing - # of lock stealing operations
* pv_spurious_wakeup - # of spurious wakeups
* pv_wait_again - # of vCPU wait's that happened after a vCPU kick
* pv_wait_early - # of early vCPU wait's
* pv_wait_head - # of vCPU wait's at the queue head
* pv_wait_node - # of vCPU wait's at a non-head queue node
*
* Writing to the "reset_counters" file will reset all the above counter
* values.
*
* These statistical counters are implemented as per-cpu variables which are
* summed and computed whenever the corresponding debugfs files are read. This
* minimizes added overhead making the counters usable even in a production
* environment.
*
* There may be a slight difference between pv_kick_wake and pv_kick_unlock.
*/
enum qlock_stats {
qstat_pv_hash_hops,
qstat_pv_kick_unlock,
qstat_pv_kick_wake,
qstat_pv_latency_kick,
qstat_pv_latency_wake,
qstat_pv_lock_stealing,
qstat_pv_spurious_wakeup,
qstat_pv_wait_again,
qstat_pv_wait_early,
qstat_pv_wait_head,
qstat_pv_wait_node,
qstat_num, /* Total number of statistical counters */
qstat_reset_cnts = qstat_num,
};
#ifdef CONFIG_QUEUED_LOCK_STAT
/*
* Collect pvqspinlock statistics
*/
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/fs.h>
/*
 * debugfs file name of every counter, indexed by enum qlock_stats and listed
 * in enum order.  The extra last slot names the write-only reset file.
 */
static const char * const qstat_names[qstat_num + 1] = {
	[qstat_pv_hash_hops]       = "pv_hash_hops",
	[qstat_pv_kick_unlock]     = "pv_kick_unlock",
	[qstat_pv_kick_wake]       = "pv_kick_wake",
	[qstat_pv_latency_kick]    = "pv_latency_kick",
	[qstat_pv_latency_wake]    = "pv_latency_wake",
	[qstat_pv_lock_stealing]   = "pv_lock_stealing",
	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
	[qstat_pv_wait_again]      = "pv_wait_again",
	[qstat_pv_wait_early]      = "pv_wait_early",
	[qstat_pv_wait_head]       = "pv_wait_head",
	[qstat_pv_wait_node]       = "pv_wait_node",
	[qstat_reset_cnts]         = "reset_counters",
};
/*
 * Per-cpu counters
 *
 * qstats holds one slot per enum qlock_stats counter on every CPU; the
 * slots are summed over all possible CPUs when a debugfs file is read.
 *
 * pv_kick_time holds the sched_clock() timestamp stored by __pv_kick() for
 * the CPU being kicked; __pv_wait() clears it before waiting and uses it
 * afterwards to compute the kick-to-wakeup latency.
 */
static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
static DEFINE_PER_CPU(u64, pv_kick_time);
/*
 * Function to read and return the qlock statistical counter values
 *
 * The per-cpu values of the counter selected by the file's i_private ID are
 * summed over all possible CPUs and formatted as a decimal string.
 *
 * The following counters are handled specially:
 * 1. qstat_pv_latency_kick
 *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
 * 2. qstat_pv_latency_wake
 *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake
 * 3. qstat_pv_hash_hops
 *    Average hops/hash = pv_hash_hops/pv_kick_unlock
 *
 * When the divisor of one of these averages is still zero (no kick has been
 * recorded yet), the average is reported as 0 instead of dividing by zero.
 *
 * Returns the value of simple_read_from_buffer(), or -EBADF when the file
 * has no inode or does not refer to a valid counter.
 */
static ssize_t qstat_read(struct file *file, char __user *user_buf,
			  size_t count, loff_t *ppos)
{
	char buf[64];
	int cpu, counter, len;
	u64 stat = 0, kicks = 0;

	/*
	 * Get the counter ID stored in file->f_inode->i_private
	 */
	if (!file->f_inode) {
		WARN_ON_ONCE(1);
		return -EBADF;
	}
	counter = (long)(file->f_inode->i_private);

	/* counter indexes qstats[], so reject anything outside the array */
	if (counter < 0 || counter >= qstat_num)
		return -EBADF;

	for_each_possible_cpu(cpu) {
		stat += per_cpu(qstats[counter], cpu);
		/*
		 * Need to sum additional counter for some of them
		 */
		switch (counter) {

		case qstat_pv_latency_kick:
		case qstat_pv_hash_hops:
			kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
			break;

		case qstat_pv_latency_wake:
			kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
			break;
		}
	}

	if (counter == qstat_pv_hash_hops) {
		u64 frac = 0;

		if (kicks) {
			/* do_div() leaves the quotient in stat, returns the remainder */
			frac = 100ULL * do_div(stat, kicks);
			frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
			/* Rounding may yield 100 hundredths; carry into the integer part */
			if (frac >= 100) {
				stat++;
				frac -= 100;
			}
		} else {
			/* No kick recorded yet: the average is undefined, report 0.00 */
			stat = 0;
		}

		/*
		 * Return a X.XX decimal number
		 */
		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
	} else {
		/*
		 * Round to the nearest ns
		 *
		 * The accumulated latency must be kept as the dividend; it
		 * is only replaced by 0 when there is nothing to divide by.
		 */
		if ((counter == qstat_pv_latency_kick) ||
		    (counter == qstat_pv_latency_wake))
			stat = kicks ? DIV_ROUND_CLOSEST_ULL(stat, kicks) : 0;

		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
	}

	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
/*
 * Function to handle write request
 *
 * Only a write to the "reset_counters" file has any effect: it zeroes every
 * counter of every possible CPU.  Writes to any other file are accepted and
 * ignored.  Since the counter updates aren't atomic, each CPU's counters are
 * cleared in two consecutive passes to make it very likely that they all
 * end up cleared.
 *
 * Returns @count, or -EBADF when the file has no inode.
 */
static ssize_t qstat_write(struct file *file, const char __user *user_buf,
			   size_t count, loff_t *ppos)
{
	int cpu;

	/*
	 * The counter ID is stored in file->f_inode->i_private
	 */
	if (!file->f_inode) {
		WARN_ON_ONCE(1);
		return -EBADF;
	}

	if ((long)(file->f_inode->i_private) == qstat_reset_cnts) {
		for_each_possible_cpu(cpu) {
			unsigned long *cnts = per_cpu_ptr(qstats, cpu);
			int pass, idx;

			for (pass = 0; pass < 2; pass++)
				for (idx = 0; idx < qstat_num; idx++)
					WRITE_ONCE(cnts[idx], 0);
		}
	}
	return count;
}
/*
 * Debugfs data structures
 *
 * A single set of file operations is shared by every counter file and by
 * the reset file; the handlers tell the files apart through the ID stored
 * in the inode's i_private field.
 */
static const struct file_operations fops_qstat = {
.read = qstat_read,
.write = qstat_write,
.llseek = default_llseek,
};
/*
 * Initialize debugfs for the qspinlock statistical counters
 *
 * Creates the "qlockstat" directory, one read-only file per counter and the
 * write-only "reset_counters" file.  Each file carries its counter ID in the
 * private data pointer.  Always returns 0; a failure to create the directory
 * is only reported with a warning.
 */
static int __init init_qspinlock_stat(void)
{
	struct dentry *dir;
	int id;

	dir = debugfs_create_dir("qlockstat", NULL);
	if (!dir) {
		pr_warn("Could not create 'qlockstat' debugfs directory\n");
		return 0;
	}

	/*
	 * Create the debugfs files
	 *
	 * As reading from and writing to the stat files can be slow, only
	 * root is allowed to do the read/write to limit impact to system
	 * performance.
	 */
	for (id = 0; id < qstat_num; id++)
		debugfs_create_file(qstat_names[id], 0400, dir,
				    (void *)(long)id, &fops_qstat);

	debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, dir,
			    (void *)(long)qstat_reset_cnts, &fops_qstat);

	return 0;
}
fs_initcall(init_qspinlock_stat);
/*
 * Increment the given PV qspinlock statistical counter of the current CPU,
 * but only when @cond is true.
 */
static inline void qstat_inc(enum qlock_stats stat, bool cond)
{
	if (!cond)
		return;

	this_cpu_inc(qstats[stat]);
}
/*
 * PV hash hop count
 *
 * Add @hopcnt to the current CPU's pv_hash_hops counter.
 */
static inline void qstat_hop(int hopcnt)
{
this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
}
/*
 * Replacement function for pv_kick()
 *
 * Records the kick start time in the target CPU's pv_kick_time, performs the
 * real kick, and adds the time spent in the kick to the current CPU's
 * pv_latency_kick counter.
 */
static inline void __pv_kick(int cpu)
{
u64 start = sched_clock();
/* Publish the timestamp for @cpu before kicking it; __pv_wait() reads it */
per_cpu(pv_kick_time, cpu) = start;
pv_kick(cpu);
this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
}
/*
 * Replacement function for pv_wait()
 *
 * Clears this CPU's pv_kick_time, performs the real wait and, if a kick
 * timestamp is present afterwards, adds the elapsed time since that
 * timestamp to pv_latency_wake and counts the wakeup in pv_kick_wake.
 */
static inline void __pv_wait(u8 *ptr, u8 val)
{
u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
/* Reset so that a non-zero value after the wait means a kick was recorded */
*pkick_time = 0;
pv_wait(ptr, val);
if (*pkick_time) {
this_cpu_add(qstats[qstat_pv_latency_wake],
sched_clock() - *pkick_time);
qstat_inc(qstat_pv_kick_wake, true);
}
}
/*
 * From this point on, pv_kick() and pv_wait() calls expand to the
 * instrumented wrappers defined above.
 */
#define pv_kick(c) __pv_kick(c)
#define pv_wait(p, v) __pv_wait(p, v)
/*
 * PV unfair trylock count tracking function
 *
 * Forwards to pv_queued_spin_steal_lock() and bumps the pv_lock_stealing
 * counter whenever that call reports a non-zero result.  The result is
 * returned to the caller unchanged.
 */
static inline int qstat_spin_steal_lock(struct qspinlock *lock)
{
	int stolen;

	stolen = pv_queued_spin_steal_lock(lock);
	qstat_inc(qstat_pv_lock_stealing, stolen);

	return stolen;
}

/* Route queued_spin_trylock() through the counting wrapper. */
#undef queued_spin_trylock
#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
#else /* CONFIG_QUEUED_LOCK_STAT */
/*
 * No-op stubs used when CONFIG_QUEUED_LOCK_STAT is not set, so that callers
 * need no #ifdef of their own.
 */
static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
static inline void qstat_hop(int hopcnt) { }
#endif /* CONFIG_QUEUED_LOCK_STAT */
...@@ -1905,6 +1905,97 @@ static void ttwu_queue(struct task_struct *p, int cpu) ...@@ -1905,6 +1905,97 @@ static void ttwu_queue(struct task_struct *p, int cpu)
raw_spin_unlock(&rq->lock); raw_spin_unlock(&rq->lock);
} }
/*
* Notes on Program-Order guarantees on SMP systems.
*
* MIGRATION
*
* The basic program-order guarantee on SMP systems is that when a task [t]
* migrates, all its activity on its old cpu [c0] happens-before any subsequent
* execution on its new cpu [c1].
*
* For migration (of runnable tasks) this is provided by the following means:
*
* A) UNLOCK of the rq(c0)->lock scheduling out task t
* B) migration for t is required to synchronize *both* rq(c0)->lock and
* rq(c1)->lock (if not at the same time, then in that order).
* C) LOCK of the rq(c1)->lock scheduling in task t
*
* Transitivity guarantees that B happens after A and C after B.
* Note: we only require RCpc transitivity.
* Note: the cpu doing B need not be c0 or c1
*
* Example:
*
* CPU0 CPU1 CPU2
*
* LOCK rq(0)->lock
* sched-out X
* sched-in Y
* UNLOCK rq(0)->lock
*
* LOCK rq(0)->lock // orders against CPU0
* dequeue X
* UNLOCK rq(0)->lock
*
* LOCK rq(1)->lock
* enqueue X
* UNLOCK rq(1)->lock
*
* LOCK rq(1)->lock // orders against CPU2
* sched-out Z
* sched-in X
* UNLOCK rq(1)->lock
*
*
* BLOCKING -- aka. SLEEP + WAKEUP
*
* For blocking we (obviously) need to provide the same guarantee as for
* migration. However the means are completely different as there is no lock
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
* 2) smp_cond_acquire(!X->on_cpu)
*
* Example:
*
* CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
*
* LOCK rq(0)->lock LOCK X->pi_lock
* dequeue X
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
* smp_cond_acquire(!X->on_cpu);
* X->state = WAKING
* set_task_cpu(X,2)
*
* LOCK rq(2)->lock
* enqueue X
* X->state = RUNNING
* UNLOCK rq(2)->lock
*
* LOCK rq(2)->lock // orders against CPU1
* sched-out Z
* sched-in X
* UNLOCK rq(2)->lock
*
* UNLOCK X->pi_lock
* UNLOCK rq(0)->lock
*
*
* However; for wakeups there is a second guarantee we must provide, namely we
* must observe the state that led to our wakeup. That is, not only must our
* task observe its own prior state, it must also observe the stores prior to
* its wakeup.
*
* This means that any means of doing remote wakeups must order the CPU doing
* the wakeup against the CPU the task is going to end up running on. This,
* however, is already required for the regular Program-Order guarantee above,
* since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
*
*/
/** /**
* try_to_wake_up - wake up a thread * try_to_wake_up - wake up a thread
* @p: the thread to be awakened * @p: the thread to be awakened
...@@ -1968,19 +2059,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ...@@ -1968,19 +2059,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/* /*
* If the owning (remote) cpu is still in the middle of schedule() with * If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task. * this task as prev, wait until its done referencing the task.
*/
while (p->on_cpu)
cpu_relax();
/*
* Combined with the control dependency above, we have an effective
* smp_load_acquire() without the need for full barriers.
* *
* Pairs with the smp_store_release() in finish_lock_switch(). * Pairs with the smp_store_release() in finish_lock_switch().
* *
* This ensures that tasks getting woken will be fully ordered against * This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order. * their previous state and preserve Program Order.
*/ */
smp_rmb(); smp_cond_acquire(!p->on_cpu);
p->sched_contributes_to_load = !!task_contributes_to_load(p); p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING; p->state = TASK_WAKING;
......
...@@ -1076,7 +1076,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ...@@ -1076,7 +1076,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* In particular, the load of prev->state in finish_task_switch() must * In particular, the load of prev->state in finish_task_switch() must
* happen before this. * happen before this.
* *
* Pairs with the control dependency and rmb in try_to_wake_up(). * Pairs with the smp_cond_acquire() in try_to_wake_up().
*/ */
smp_store_release(&prev->on_cpu, 0); smp_store_release(&prev->on_cpu, 0);
#endif #endif
......
...@@ -27,6 +27,65 @@ do { \ ...@@ -27,6 +27,65 @@ do { \
(unsigned long long)r); \ (unsigned long long)r); \
} while (0) } while (0)
/*
 * Test for an atomic operation family,
 * @test should be a macro accepting parameters (bit, op, ...)
 *
 * Invokes @test four times: once for @op itself and once for each of its
 * _acquire, _release and _relaxed variants, forwarding any extra arguments.
 */
#define FAMILY_TEST(test, bit, op, args...) \
do { \
test(bit, op, ##args); \
test(bit, op##_acquire, ##args); \
test(bit, op##_release, ##args); \
test(bit, op##_relaxed, ##args); \
} while (0)
/*
 * Check a value-returning operation atomic<bit>_<op>(val, &v): starting from
 * v0, both the returned value and the value left in v must equal the result
 * of applying the plain C operator @c_op with @val to v0.
 *
 * Uses v, v0 and r from the scope in which the macro is expanded.
 */
#define TEST_RETURN(bit, op, c_op, val) \
do { \
atomic##bit##_set(&v, v0); \
r = v0; \
r c_op val; \
BUG_ON(atomic##bit##_##op(val, &v) != r); \
BUG_ON(atomic##bit##_read(&v) != r); \
} while (0)
/* Run TEST_RETURN for @op and all of its ordering variants. */
#define RETURN_FAMILY_TEST(bit, op, c_op, val) \
do { \
FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \
} while (0)
/*
 * Check an operation of the form atomic<bit>_<op>(&v, args...): set v to
 * @init, then require the call to return @ret and to leave @expect in v.
 *
 * Uses v from the scope in which the macro is expanded.
 */
#define TEST_ARGS(bit, op, init, ret, expect, args...) \
do { \
atomic##bit##_set(&v, init); \
BUG_ON(atomic##bit##_##op(&v, ##args) != ret); \
BUG_ON(atomic##bit##_read(&v) != expect); \
} while (0)
/*
 * xchg family: exchanging @new into a variable holding @init must return
 * @init and leave @new behind.
 */
#define XCHG_FAMILY_TEST(bit, init, new) \
do { \
FAMILY_TEST(TEST_ARGS, bit, xchg, init, init, new, new); \
} while (0)
/*
 * cmpxchg family, two cases per variant:
 *  - comparing against @init: the call returns @init and stores @new;
 *  - comparing against @wrong: the call returns @init and the stored value
 *    stays @init.
 */
#define CMPXCHG_FAMILY_TEST(bit, init, new, wrong) \
do { \
FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \
init, init, new, init, new); \
FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \
init, init, init, wrong, new); \
} while (0)
/*
 * inc_return family: starting from @i, the call must return and store
 * (i) + one.  Uses one from the scope in which the macro is expanded.
 */
#define INC_RETURN_FAMILY_TEST(bit, i) \
do { \
FAMILY_TEST(TEST_ARGS, bit, inc_return, \
i, (i) + one, (i) + one); \
} while (0)
/*
 * dec_return family: starting from @i, the call must return and store
 * (i) - one.  Uses one from the scope in which the macro is expanded.
 */
#define DEC_RETURN_FAMILY_TEST(bit, i) \
do { \
FAMILY_TEST(TEST_ARGS, bit, dec_return, \
i, (i) - one, (i) - one); \
} while (0)
static __init void test_atomic(void) static __init void test_atomic(void)
{ {
int v0 = 0xaaa31337; int v0 = 0xaaa31337;
...@@ -45,6 +104,18 @@ static __init void test_atomic(void) ...@@ -45,6 +104,18 @@ static __init void test_atomic(void)
TEST(, and, &=, v1); TEST(, and, &=, v1);
TEST(, xor, ^=, v1); TEST(, xor, ^=, v1);
TEST(, andnot, &= ~, v1); TEST(, andnot, &= ~, v1);
RETURN_FAMILY_TEST(, add_return, +=, onestwos);
RETURN_FAMILY_TEST(, add_return, +=, -one);
RETURN_FAMILY_TEST(, sub_return, -=, onestwos);
RETURN_FAMILY_TEST(, sub_return, -=, -one);
INC_RETURN_FAMILY_TEST(, v0);
DEC_RETURN_FAMILY_TEST(, v0);
XCHG_FAMILY_TEST(, v0, v1);
CMPXCHG_FAMILY_TEST(, v0, v1, onestwos);
} }
#define INIT(c) do { atomic64_set(&v, c); r = c; } while (0) #define INIT(c) do { atomic64_set(&v, c); r = c; } while (0)
...@@ -74,59 +145,26 @@ static __init void test_atomic64(void) ...@@ -74,59 +145,26 @@ static __init void test_atomic64(void)
TEST(64, xor, ^=, v1); TEST(64, xor, ^=, v1);
TEST(64, andnot, &= ~, v1); TEST(64, andnot, &= ~, v1);
INIT(v0); RETURN_FAMILY_TEST(64, add_return, +=, onestwos);
r += onestwos; RETURN_FAMILY_TEST(64, add_return, +=, -one);
BUG_ON(atomic64_add_return(onestwos, &v) != r); RETURN_FAMILY_TEST(64, sub_return, -=, onestwos);
BUG_ON(v.counter != r); RETURN_FAMILY_TEST(64, sub_return, -=, -one);
INIT(v0);
r += -one;
BUG_ON(atomic64_add_return(-one, &v) != r);
BUG_ON(v.counter != r);
INIT(v0);
r -= onestwos;
BUG_ON(atomic64_sub_return(onestwos, &v) != r);
BUG_ON(v.counter != r);
INIT(v0);
r -= -one;
BUG_ON(atomic64_sub_return(-one, &v) != r);
BUG_ON(v.counter != r);
INIT(v0); INIT(v0);
atomic64_inc(&v); atomic64_inc(&v);
r += one; r += one;
BUG_ON(v.counter != r); BUG_ON(v.counter != r);
INIT(v0);
r += one;
BUG_ON(atomic64_inc_return(&v) != r);
BUG_ON(v.counter != r);
INIT(v0); INIT(v0);
atomic64_dec(&v); atomic64_dec(&v);
r -= one; r -= one;
BUG_ON(v.counter != r); BUG_ON(v.counter != r);
INIT(v0); INC_RETURN_FAMILY_TEST(64, v0);
r -= one; DEC_RETURN_FAMILY_TEST(64, v0);
BUG_ON(atomic64_dec_return(&v) != r);
BUG_ON(v.counter != r);
INIT(v0); XCHG_FAMILY_TEST(64, v0, v1);
BUG_ON(atomic64_xchg(&v, v1) != v0); CMPXCHG_FAMILY_TEST(64, v0, v1, v2);
r = v1;
BUG_ON(v.counter != r);
INIT(v0);
BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0);
r = v1;
BUG_ON(v.counter != r);
INIT(v0);
BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0);
BUG_ON(v.counter != r);
INIT(v0); INIT(v0);
BUG_ON(atomic64_add_unless(&v, one, v0)); BUG_ON(atomic64_add_unless(&v, one, v0));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment