Commit 22db71bc authored by Michael Ellerman's avatar Michael Ellerman

Merge branch 'topic/qspinlock' into next

Merge Nick's powerpc qspinlock implementation. From his cover letter:

This replaces the generic queued spinlock code (like s390 does) with our
own implementation.

Generic PV qspinlock code is causing latency / starvation regressions on
large systems that are resulting in hard lockups reported (mostly in
pathoogical cases). The generic qspinlock code has a number of issues
important for powerpc hardware and hypervisors that aren't easily solved
without changing code that would impact other architectures. Follow
s390's lead and implement our own for now.

Issues for powerpc using generic qspinlocks:
  - The previous lock value should not be loaded with simple loads, and
    need not be passed around from previous loads or cmpxchg results,
    because powerpc uses ll/sc-style atomics which can perform more
    complex operations that do not require this. powerpc implementations
    tend to prefer loads use larx for improved coherency performance.
  - The queueing process should absolutely minimise the number of stores
    to the lock word to reduce exclusive coherency probes, important for
    large system scalability. The pending logic is counter productive
    here.
  - Non-atomic unlock for paravirt locks is important (atomic
    instructions tend to still be more expensive than x86 CPUs).
  - Yielding to the lock owner is important in the oversubscribed
    paravirt case, which requires storing the owner CPU in the lock
    word.
  - More control of lock stealing for the paravirt case is important to
    keep latency down on large systems.
  - The lock acquisition operation should always be made with a special
    variant of atomic instructions with the lock hint bit set,
    including (especially) in the queueing paths. This is more a matter
    of adding more arch lock helpers so not an insurmountable problem
    for generic code.
parents 94ba4f2c 0b219984
......@@ -99,7 +99,7 @@ config LOCKDEP_SUPPORT
config GENERIC_LOCKBREAK
bool
default y
depends on SMP && PREEMPTION
depends on SMP && PREEMPTION && !PPC_QUEUED_SPINLOCKS
config GENERIC_HWEIGHT
bool
......@@ -158,7 +158,6 @@ config PPC
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS
select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
......
......@@ -2,83 +2,173 @@
#ifndef _ASM_POWERPC_QSPINLOCK_H
#define _ASM_POWERPC_QSPINLOCK_H
#include <asm-generic/qspinlock_types.h>
#include <linux/compiler.h>
#include <asm/qspinlock_types.h>
#include <asm/paravirt.h>
#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */
#ifdef CONFIG_PPC64
/*
* Use the EH=1 hint for accesses that result in the lock being acquired.
* The hardware is supposed to optimise this pattern by holding the lock
* cacheline longer, and releasing when a store to the same memory (the
* unlock) is performed.
*/
#define _Q_SPIN_EH_HINT 1
#else
#define _Q_SPIN_EH_HINT 0
#endif
#ifdef CONFIG_PARAVIRT_SPINLOCKS
extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_queued_spin_unlock(struct qspinlock *lock);
/*
* The trylock itself may steal. This makes trylocks slightly stronger, and
* makes locks slightly more efficient when stealing.
*
* This is compile-time, so if true then there may always be stealers, so the
* nosteal paths become unused.
*/
#define _Q_SPIN_TRY_LOCK_STEAL 1
static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
if (!is_shared_processor())
native_queued_spin_lock_slowpath(lock, val);
else
__pv_queued_spin_lock_slowpath(lock, val);
}
/*
* Put a speculation barrier after testing the lock/node and finding it
* busy. Try to prevent pointless speculation in slow paths.
*
* Slows down the lockstorm microbenchmark with no stealing, where locking
* is purely FIFO through the queue. May have more benefit in real workload
* where speculating into the wrong place could have a greater cost.
*/
#define _Q_SPIN_SPEC_BARRIER 0
#define queued_spin_unlock queued_spin_unlock
static inline void queued_spin_unlock(struct qspinlock *lock)
{
if (!is_shared_processor())
smp_store_release(&lock->locked, 0);
else
__pv_queued_spin_unlock(lock);
}
#ifdef CONFIG_PPC64
/*
* Execute a miso instruction after passing the MCS lock ownership to the
* queue head. Miso is intended to make stores visible to other CPUs sooner.
*
* This seems to make the lockstorm microbenchmark nospin test go slightly
* faster on POWER10, but disable for now.
*/
#define _Q_SPIN_MISO 0
#else
#define _Q_SPIN_MISO 0
#endif
#ifdef CONFIG_PPC64
/*
* This executes miso after an unlock of the lock word, having ownership
* pass to the next CPU sooner. This will slow the uncontended path to some
* degree. Not evidence it helps yet.
*/
#define _Q_SPIN_MISO_UNLOCK 0
#else
extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
#define _Q_SPIN_MISO_UNLOCK 0
#endif
static __always_inline void queued_spin_lock(struct qspinlock *lock)
/*
* Seems to slow down lockstorm microbenchmark, suspect queue node just
* has to become shared again right afterwards when its waiter spins on
* the lock field.
*/
#define _Q_SPIN_PREFETCH_NEXT 0
static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
{
u32 val = 0;
return READ_ONCE(lock->val);
}
if (likely(arch_atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
return;
static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
{
return !lock.val;
}
queued_spin_lock_slowpath(lock, val);
static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
{
return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK);
}
#define queued_spin_lock queued_spin_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define SPIN_THRESHOLD (1<<15) /* not tuned */
static __always_inline u32 queued_spin_encode_locked_val(void)
{
/* XXX: make this use lock value in paca like simple spinlocks? */
return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET);
}
static __always_inline void pv_wait(u8 *ptr, u8 val)
static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock *lock)
{
if (*ptr != val)
return;
yield_to_any();
/*
* We could pass in a CPU here if waiting in the queue and yield to
* the previous CPU in the queue.
*/
u32 new = queued_spin_encode_locked_val();
u32 prev;
/* Trylock succeeds only when unlocked and no queued nodes */
asm volatile(
"1: lwarx %0,0,%1,%3 # __queued_spin_trylock_nosteal \n"
" cmpwi 0,%0,0 \n"
" bne- 2f \n"
" stwcx. %2,0,%1 \n"
" bne- 1b \n"
"\t" PPC_ACQUIRE_BARRIER " \n"
"2: \n"
: "=&r" (prev)
: "r" (&lock->val), "r" (new),
"i" (_Q_SPIN_EH_HINT)
: "cr0", "memory");
return likely(prev == 0);
}
static __always_inline void pv_kick(int cpu)
static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock)
{
prod_cpu(cpu);
u32 new = queued_spin_encode_locked_val();
u32 prev, tmp;
/* Trylock may get ahead of queued nodes if it finds unlocked */
asm volatile(
"1: lwarx %0,0,%2,%5 # __queued_spin_trylock_steal \n"
" andc. %1,%0,%4 \n"
" bne- 2f \n"
" and %1,%0,%4 \n"
" or %1,%1,%3 \n"
" stwcx. %1,0,%2 \n"
" bne- 1b \n"
"\t" PPC_ACQUIRE_BARRIER " \n"
"2: \n"
: "=&r" (prev), "=&r" (tmp)
: "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK),
"i" (_Q_SPIN_EH_HINT)
: "cr0", "memory");
return likely(!(prev & ~_Q_TAIL_CPU_MASK));
}
extern void __pv_init_lock_hash(void);
static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
if (!_Q_SPIN_TRY_LOCK_STEAL)
return __queued_spin_trylock_nosteal(lock);
else
return __queued_spin_trylock_steal(lock);
}
static inline void pv_spinlocks_init(void)
void queued_spin_lock_slowpath(struct qspinlock *lock);
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
__pv_init_lock_hash();
if (!queued_spin_trylock(lock))
queued_spin_lock_slowpath(lock);
}
#endif
static inline void queued_spin_unlock(struct qspinlock *lock)
{
smp_store_release(&lock->locked, 0);
if (_Q_SPIN_MISO_UNLOCK)
asm volatile("miso" ::: "memory");
}
/*
* Queued spinlocks rely heavily on smp_cond_load_relaxed() to busy-wait,
* which was found to have performance problems if implemented with
* the preferred spin_begin()/spin_end() SMT priority pattern. Use the
* generic version instead.
*/
#define arch_spin_is_locked(l) queued_spin_is_locked(l)
#define arch_spin_is_contended(l) queued_spin_is_contended(l)
#define arch_spin_value_unlocked(l) queued_spin_value_unlocked(l)
#define arch_spin_lock(l) queued_spin_lock(l)
#define arch_spin_trylock(l) queued_spin_trylock(l)
#define arch_spin_unlock(l) queued_spin_unlock(l)
#include <asm-generic/qspinlock.h>
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void pv_spinlocks_init(void);
#else
static inline void pv_spinlocks_init(void) { }
#endif
#endif /* _ASM_POWERPC_QSPINLOCK_H */
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_POWERPC_QSPINLOCK_PARAVIRT_H
#define _ASM_POWERPC_QSPINLOCK_PARAVIRT_H
EXPORT_SYMBOL(__pv_queued_spin_unlock);
#endif /* _ASM_POWERPC_QSPINLOCK_PARAVIRT_H */
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_POWERPC_QSPINLOCK_TYPES_H
#define _ASM_POWERPC_QSPINLOCK_TYPES_H
#include <linux/types.h>
#include <asm/byteorder.h>
typedef struct qspinlock {
union {
u32 val;
#ifdef __LITTLE_ENDIAN
struct {
u16 locked;
u8 reserved[2];
};
#else
struct {
u8 reserved[2];
u16 locked;
};
#endif
};
} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } }
/*
* Bitfields in the lock word:
*
* 0: locked bit
* 1-14: lock holder cpu
* 15: lock owner or queuer vcpus observed to be preempted bit
* 16: must queue bit
* 17-31: tail cpu (+1)
*/
#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\
<< _Q_ ## type ## _OFFSET)
/* 0x00000001 */
#define _Q_LOCKED_OFFSET 0
#define _Q_LOCKED_BITS 1
#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET)
/* 0x00007ffe */
#define _Q_OWNER_CPU_OFFSET 1
#define _Q_OWNER_CPU_BITS 14
#define _Q_OWNER_CPU_MASK _Q_SET_MASK(OWNER_CPU)
#if CONFIG_NR_CPUS > (1U << _Q_OWNER_CPU_BITS)
#error "qspinlock does not support such large CONFIG_NR_CPUS"
#endif
/* 0x00008000 */
#define _Q_SLEEPY_OFFSET 15
#define _Q_SLEEPY_BITS 1
#define _Q_SLEEPY_VAL (1U << _Q_SLEEPY_OFFSET)
/* 0x00010000 */
#define _Q_MUST_Q_OFFSET 16
#define _Q_MUST_Q_BITS 1
#define _Q_MUST_Q_VAL (1U << _Q_MUST_Q_OFFSET)
/* 0xfffe0000 */
#define _Q_TAIL_CPU_OFFSET 17
#define _Q_TAIL_CPU_BITS 15
#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU)
#if CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)
#error "qspinlock does not support such large CONFIG_NR_CPUS"
#endif
#endif /* _ASM_POWERPC_QSPINLOCK_TYPES_H */
......@@ -13,7 +13,7 @@
/* See include/linux/spinlock.h */
#define smp_mb__after_spinlock() smp_mb()
#ifndef CONFIG_PARAVIRT_SPINLOCKS
#ifndef CONFIG_PPC_QUEUED_SPINLOCKS
static inline void pv_spinlocks_init(void) { }
#endif
......
......@@ -7,7 +7,7 @@
#endif
#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
#include <asm-generic/qspinlock_types.h>
#include <asm/qspinlock_types.h>
#include <asm-generic/qrwlock_types.h>
#else
#include <asm/simple_spinlock_types.h>
......
......@@ -52,7 +52,9 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
memcpy_64.o copy_mc_64.o
ifndef CONFIG_PPC_QUEUED_SPINLOCKS
ifdef CONFIG_PPC_QUEUED_SPINLOCKS
obj-$(CONFIG_SMP) += qspinlock.o
else
obj64-$(CONFIG_SMP) += locks.o
endif
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment