Commit f7853c34 authored by Peter Zijlstra

locking/rtmutex: Fix task->pi_waiters integrity

Henry reported that rt_mutex_adjust_prio_chain() has an ordering
problem and puts the lie to the comment in [7]. Sharing the sort key
between lock->waiters and owner->pi_waiters *does* create problems,
since unlike what the comment claims, holding [L] is insufficient.

Notably, consider:

	  A
	 / \
	M1   M2
	|    |
	B    C

That is, task A owns both M1 and M2; B and C block on them. In this
case the concurrent chain walks (B and C) will modify their respective
sort keys in [7] while holding M1->wait_lock and M2->wait_lock
respectively. So holding [L] is meaningless; they are different Ls.
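
For reference, a trimmed sketch of the pre-patch layout (field names
taken from the old side of the diff below, other members omitted):
both rb-tree nodes share a single set of sort keys, even though the
two trees are serialized by different locks.

	struct rt_mutex_waiter {
		struct rb_node	tree_entry;	/* in lock->waiters, under lock->wait_lock */
		struct rb_node	pi_tree_entry;	/* in owner->pi_waiters, under owner->pi_lock */
		int		prio;		/* shared sort keys, written in [7] */
		u64		deadline;	/* under lock->wait_lock only */
	};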

This then gives rise to a race condition between [7] and [11], where
the requeue of pi_waiters will observe an inconsistent tree order.

	B				C

  (holds M1->wait_lock,		(holds M2->wait_lock,
   holds B->pi_lock)		 holds A->pi_lock)

  [7]
  waiter_update_prio();
  ...
  [8]
  raw_spin_unlock(B->pi_lock);
  ...
  [10]
  raw_spin_lock(A->pi_lock);

				[11]
				rt_mutex_enqueue_pi();
				// observes inconsistent A->pi_waiters
				// tree order

Fixing this means either extending the range of the owner lock from
[10-13] to [6-13], with the immediate problem that this means [6-8]
hold both blocked and owner locks, or duplicating the sort key.

Since the locking in chain walk is horrible enough without having to
consider pi_lock nesting rules, duplicate the sort key instead.

By giving each tree its own sort key, the above race becomes
harmless: if C sees B at the old location, then B will correct things
(if they need correcting) when it walks up the chain and reaches A.
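
A minimal sketch of the resulting scheme; the copy helper is not
visible in the collapsed hunks below, so its exact shape here is
illustrative:

	/*
	 * ->tree.* is only ever written under lock->wait_lock; the pi
	 * copy is refreshed from it only under the owner's pi_lock, so
	 * each tree's sort order is stable under its own lock.
	 */
	static void waiter_clone_prio(struct rt_mutex_waiter *waiter,
				      struct task_struct *owner)
	{
		lockdep_assert_held(&owner->pi_lock);
		waiter->pi_tree.prio     = waiter->tree.prio;
		waiter->pi_tree.deadline = waiter->tree.deadline;
	}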

Fixes: fb00aca4 ("rtmutex: Turn the plist into an rb-tree")
Reported-by: Henry Wu <triangletrap12@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Henry Wu <triangletrap12@gmail.com>
Link: https://lkml.kernel.org/r/20230707161052.GF2883469%40hirez.programming.kicks-ass.net
parent fdf0eaf1
@@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task)
 
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 	waiter = task->pi_blocked_on;
-	if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+	if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
 		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 		return;
 	}

@@ -17,27 +17,44 @@
 #include <linux/rtmutex.h>
 #include <linux/sched/wake_q.h>
 
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry:		rbtree node to enqueue into the waiters tree
+ * @prio:		Priority of the waiter
+ * @deadline:		Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+	struct rb_node	entry;
+	int		prio;
+	u64		deadline;
+};
+
 /*
  * This is the control structure for tasks blocked on a rt_mutex,
  * which is allocated on the kernel stack on of the blocked task.
  *
- * @tree_entry:		pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry:	pi node to enqueue into the mutex owner waiters tree
+ * @tree:		node to enqueue into the mutex waiters tree
+ * @pi_tree:		node to enqueue into the mutex owner waiters tree
  * @task:		task reference to the blocked task
  * @lock:		Pointer to the rt_mutex on which the waiter blocks
  * @wake_state:		Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
- * @prio:		Priority of the waiter
- * @deadline:		Deadline of the waiter if applicable
  * @ww_ctx:		WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
  */
 struct rt_mutex_waiter {
-	struct rb_node		tree_entry;
-	struct rb_node		pi_tree_entry;
+	struct rt_waiter_node	tree;
+	struct rt_waiter_node	pi_tree;
 	struct task_struct	*task;
 	struct rt_mutex_base	*lock;
 	unsigned int		wake_state;
-	int			prio;
-	u64			deadline;
 	struct ww_acquire_ctx	*ww_ctx;
 };
@@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
 {
 	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
 
-	return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
+	return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
 }
 
 static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
@@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
 	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
 	struct rt_mutex_waiter *w = NULL;
 
+	lockdep_assert_held(&lock->wait_lock);
+
 	if (leftmost) {
-		w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+		w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
 		BUG_ON(w->lock != lock);
 	}
 	return w;
@@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p)
 
 static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
-			pi_tree_entry);
+			pi_tree.entry);
 }
 
 #define RT_MUTEX_HAS_WAITERS	1UL
@@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
 static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
 {
 	debug_rt_mutex_init_waiter(waiter);
-	RB_CLEAR_NODE(&waiter->pi_tree_entry);
-	RB_CLEAR_NODE(&waiter->tree_entry);
+	RB_CLEAR_NODE(&waiter->pi_tree.entry);
+	RB_CLEAR_NODE(&waiter->tree.entry);
 	waiter->wake_state = TASK_NORMAL;
 	waiter->task = NULL;
 }

@@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock)
 	struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
 	if (!n)
 		return NULL;
-	return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+	return rb_entry(n, struct rt_mutex_waiter, tree.entry);
 }
 
 static inline struct rt_mutex_waiter *
 __ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
 {
-	struct rb_node *n = rb_next(&w->tree_entry);
+	struct rb_node *n = rb_next(&w->tree.entry);
 	if (!n)
 		return NULL;
-	return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+	return rb_entry(n, struct rt_mutex_waiter, tree.entry);
 }
 
 static inline struct rt_mutex_waiter *
 __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
 {
-	struct rb_node *n = rb_prev(&w->tree_entry);
+	struct rb_node *n = rb_prev(&w->tree.entry);
 	if (!n)
 		return NULL;
-	return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+	return rb_entry(n, struct rt_mutex_waiter, tree.entry);
 }
 
 static inline struct rt_mutex_waiter *
@@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock)
 	struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
 	if (!n)
 		return NULL;
-	return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+	return rb_entry(n, struct rt_mutex_waiter, tree.entry);
 }
 
 static inline void