[PATCH] Fix an SMP+preempt latency problem

Here is spin_lock():

    #define spin_lock(lock) \
    do { \
            preempt_disable(); \
            _raw_spin_lock(lock); \
    } while(0)

Here is the scenario:

    CPU0:
            spin_lock(some_lock);
            do_very_long_thing();   /* This has cond_resched()s in it */

    CPU1:
            spin_lock(some_lock);

Now suppose that the scheduler tries to schedule a task on CPU1. Nothing happens, because CPU1 is spinning on the lock with preemption disabled. CPU0 will happily hold the lock for a long time because nobody has set need_resched() against CPU0.

This problem can cause scheduling latencies of many tens of milliseconds on SMP on kernels which handle UP quite happily.

This patch fixes the problem by changing the spin_lock() and write_lock() contended slowpath to spin on the lock by hand, while polling for preemption requests.

I would have done read_lock() too, but we don't seem to have read_trylock() primitives.

The patch also shrinks the kernel by 30k due to not having separate out-of-line spinning code for each spin_lock() callsite.

[PATCH] Fix an SMP+preempt latency problem
Here is spin_lock():

    #define spin_lock(lock) \
    do { \
            preempt_disable(); \
            _raw_spin_lock(lock); \
    } while(0)

Here is the scenario:

    CPU0:
            spin_lock(some_lock);
            do_very_long_thing();   /* This has cond_resched()s in it */

    CPU1:
            spin_lock(some_lock);

Now suppose that the scheduler tries to schedule a task on CPU1. Nothing happens, because CPU1 is spinning on the lock with preemption disabled. CPU0 will happily hold the lock for a long time because nobody has set need_resched() against CPU0.

This problem can cause scheduling latencies of many tens of milliseconds on SMP on kernels which handle UP quite happily.

This patch fixes the problem by changing the spin_lock() and write_lock() contended slowpath to spin on the lock by hand, while polling for preemption requests.

I would have done read_lock() too, but we don't seem to have read_trylock() primitives.

The patch also shrinks the kernel by 30k due to not having separate out-of-line spinning code for each spin_lock() callsite.
2faf4338 · Andrew Morton · Richard Henderson · b4adddd6 · 2faf4338 · 2faf4338
Commit 2faf4338 authored Jan 10, 2003 by Andrew Morton Committed by Richard Henderson Jan 10, 2003
Hide whitespace changes
Inline Side-by-side

Showing with 90 additions and 17 deletions

include/linux/spinlock.h include/linux/spinlock.h +39 -17

kernel/ksyms.c kernel/ksyms.c +4 -0

kernel/sched.c kernel/sched.c +47 -0

No files found.
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -85,31 +85,37 @@
 * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
 * methods are defined as nops in the case they are not required.
 */
-#define spin_lock(lock)	\
-do { \
-	preempt_disable(); \
-	_raw_spin_lock(lock); \
-} while(0)
-
 #define spin_trylock(lock)	({preempt_disable(); _raw_spin_trylock(lock) ? \
 				1 : ({preempt_enable(); 0;});})

-#define spin_unlock(lock) \
+#define write_trylock(lock)	({preempt_disable();_raw_write_trylock(lock) ? \
+				1 : ({preempt_enable(); 0;});})
+
+/* Where's read_trylock? */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+void __preempt_spin_lock(spinlock_t *lock);
+void __preempt_write_lock(rwlock_t *lock);
+
+#define spin_lock(lock) \
 do { \
-	_raw_spin_unlock(lock); \
-	preempt_enable(); \
+	preempt_disable(); \
+	if (unlikely(!_raw_spin_trylock(lock))) \
+		__preempt_spin_lock(lock); \
 } while (0)

-#define read_lock(lock)	\
+#define write_lock(lock) \
 do { \
 	preempt_disable(); \
-	_raw_read_lock(lock); \
-} while(0)
+	if (unlikely(!_raw_write_trylock(lock))) \
+		__preempt_write_lock(lock); \
+} while (0)

-#define read_unlock(lock) \
+#else
+#define spin_lock(lock)	\
 do { \
-	_raw_read_unlock(lock); \
-	preempt_enable(); \
+	preempt_disable(); \
+	_raw_spin_lock(lock); \
 } while(0)

 #define write_lock(lock) \
@@ -117,6 +123,19 @@ do { \
 	preempt_disable(); \
 	_raw_write_lock(lock); \
 } while(0)
+#endif
+
+#define read_lock(lock)	\
+do { \
+	preempt_disable(); \
+	_raw_read_lock(lock); \
+} while(0)
+
+#define spin_unlock(lock) \
+do { \
+	_raw_spin_unlock(lock); \
+	preempt_enable(); \
+} while (0)

 #define write_unlock(lock) \
 do { \
@@ -124,8 +143,11 @@ do { \
 	preempt_enable(); \
 } while(0)

-#define write_trylock(lock)	({preempt_disable();_raw_write_trylock(lock) ? \
-				1 : ({preempt_enable(); 0;});})
+#define read_unlock(lock) \
+do { \
+	_raw_read_unlock(lock); \
+	preempt_enable(); \
+} while(0)

 #define spin_lock_irqsave(lock, flags) \
 do { \

--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -491,6 +491,10 @@ EXPORT_SYMBOL(do_settimeofday);
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 EXPORT_SYMBOL(__might_sleep);
 #endif
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+EXPORT_SYMBOL(__preempt_spin_lock);
+EXPORT_SYMBOL(__preempt_write_lock);
+#endif
 #if !defined(__ia64__)
 EXPORT_SYMBOL(loops_per_jiffy);
 #endif

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2278,3 +2278,50 @@ void __might_sleep(char *file, int line)
 #endif
 }
 #endif
+
+
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+/*
+ * This could be a long-held lock.  If another CPU holds it for a long time,
+ * and that CPU is not asked to reschedule then *this* CPU will spin on the
+ * lock for a long time, even if *this* CPU is asked to reschedule.
+ *
+ * So what we do here, in the slow (contended) path is to spin on the lock by
+ * hand while permitting preemption.
+ *
+ * Called inside preempt_disable().
+ */
+void __preempt_spin_lock(spinlock_t *lock)
+{
+	if (preempt_count() > 1) {	/* more than the caller's own disable */
+		_raw_spin_lock(lock);	/* plain spin, no resched polling */
+		return;
+	}
+
+	while (!_raw_spin_trylock(lock)) {	/* retry the lock by hand */
+		if (need_resched()) {
+			preempt_enable_no_resched();	/* drop caller's count */
+			__cond_resched();
+			preempt_disable();	/* re-take it before retrying */
+		}
+		cpu_relax();
+	}
+}
+
+void __preempt_write_lock(rwlock_t *lock)	/* rwlock twin of __preempt_spin_lock */
+{
+	if (preempt_count() > 1) {	/* more than the caller's own disable */
+		_raw_write_lock(lock);	/* plain spin, no resched polling */
+		return;
+	}
+
+	while (!_raw_write_trylock(lock)) {	/* retry the lock by hand */
+		if (need_resched()) {
+			preempt_enable_no_resched();	/* drop caller's count */
+			__cond_resched();
+			preempt_disable();	/* re-take it before retrying */
+		}
+		cpu_relax();
+	}
+}
+#endif