Commit 35d2b7ff authored by John Fastabend, committed by Daniel Borkmann

bpf, sockmap: Fix preempt_rt splat when using raw_spin_lock_t

Sockmap and sockhash maps are collections of psocks, where a psock is
an object representing a socket plus a set of metadata needed to
manage the BPF programs associated with that socket. These maps use
the stab->lock to protect against concurrent operations on the map,
e.g. two updates trying to insert an object into the same array slot
at the same time. Additionally, a sockhash map
has a bucket lock to protect iteration and insert/delete into
the hash entry.
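
For reference, the two structures carrying these locks look as follows
(fields as they appear in the diff below; the raw_spinlock_t members
are the ones this patch converts):

  struct bpf_stab {
          struct bpf_map map;
          struct sock **sks;
          struct sk_psock_progs progs;
          raw_spinlock_t lock;        /* protects updates to sks[] slots */
  };

  struct bpf_shtab_bucket {
          struct hlist_head head;
          raw_spinlock_t lock;        /* protects this hash bucket */
  };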

Each psock has a psock->link which is a linked list of all the
maps that a psock is attached to. This allows a psock (socket)
to be included in multiple sockmap and sockhash maps. This
linked list is protected by the psock->link_lock.
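
A condensed sketch of the psock side (based on include/linux/skmsg.h;
unrelated fields omitted and field names abridged):

  struct sk_psock_link {
          struct list_head list;      /* node on the psock->link list */
          struct bpf_map *map;        /* the sockmap/sockhash holding the sk */
          void *link_raw;             /* slot/element inside that map */
  };

  struct sk_psock {
          ...
          struct list_head link;      /* one sk_psock_link per map */
          spinlock_t link_lock;       /* protects the psock->link list */
          ...
  };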

The two locks _must_ be nested correctly to avoid deadlock:

  lock(stab->lock)
    : do BPF map operations and psock insert/delete
    lock(psock->link_lock)
       : add map to psock linked list of maps
    unlock(psock->link_lock)
  unlock(stab->lock)
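
As a concrete example, in the current code sock_map_update_common()
realizes this nesting roughly as below (condensed sketch, error
handling dropped, sock_map_add_link() internals shown as a comment;
not the verbatim kernel source):

   raw_spin_lock_bh(&stab->lock);       /* outer: protects the sks[idx] slot */
   osk = stab->sks[idx];
   ...
   sock_map_add_link(psock, link, map, &stab->sks[idx]);
           /* inner lock, taken inside sock_map_add_link():
            *   spin_lock_bh(&psock->link_lock);
            *   list_add_tail(&link->list, &psock->link);
            *   spin_unlock_bh(&psock->link_lock);
            */
   stab->sks[idx] = sk;
   if (osk)
           sock_map_unref(osk, &stab->sks[idx]);
   raw_spin_unlock_bh(&stab->lock);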

On non-PREEMPT_RT kernels both raw_spinlock_t and spinlock_t are
guaranteed not to sleep. But on PREEMPT_RT kernels the spinlock_t
variants may sleep. In the current code we have many patterns like
this:

   rcu_critical_section:
      raw_spin_lock(stab->lock)
         spin_lock(psock->link_lock) <- may sleep ouch
         spin_unlock(psock->link_lock)
      raw_spin_unlock(stab->lock)
   rcu_critical_section

Nesting spin_lock() inside a raw_spin_lock() violates the locking
rules for PREEMPT_RT kernels. Additionally, we do GFP_ATOMIC
allocations while holding stab->lock, and those may also sleep on
PREEMPT_RT kernels.
The result is splats like this:

./test_progs -t sockmap_basic
[   33.344330] bpf_testmod: loading out-of-tree module taints kernel.
[   33.441933]
[   33.442089] =============================
[   33.442421] [ BUG: Invalid wait context ]
[   33.442763] 6.5.0-rc5-01731-gec0ded2e #4958 Tainted: G           O
[   33.443320] -----------------------------
[   33.443624] test_progs/2073 is trying to lock:
[   33.443960] ffff888102a1c290 (&psock->link_lock){....}-{3:3}, at: sock_map_update_common+0x2c2/0x3d0
[   33.444636] other info that might help us debug this:
[   33.444991] context-{5:5}
[   33.445183] 3 locks held by test_progs/2073:
[   33.445498]  #0: ffff88811a208d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: sock_map_update_elem_sys+0xff/0x330
[   33.446159]  #1: ffffffff842539e0 (rcu_read_lock){....}-{1:3}, at: sock_map_update_elem_sys+0xf5/0x330
[   33.446809]  #2: ffff88810d687240 (&stab->lock){+...}-{2:2}, at: sock_map_update_common+0x177/0x3d0
[   33.447445] stack backtrace:
[   33.447655] CPU: 10 PID

To fix this, observe that we can't readily remove the allocations
(for that we would need to use/create something similar to
bpf_map_alloc), so convert raw_spinlock_t to spinlock_t. Note that
sock_map_update, which would trigger the allocation and potential
sleep, is only reachable through sys_bpf ops and via sock_ops, which
precludes hardirq context and low-level atomic sections on PREEMPT_RT
kernels. On non-PREEMPT_RT kernels there are no changes here: spin
lock sections and GFP_ATOMIC allocations remain non-sleeping.
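
In practice the change is a mechanical one-to-one replacement of the
lock API, as visible in the diff below:

   raw_spin_lock_init()   ->  spin_lock_init()
   raw_spin_lock_bh()     ->  spin_lock_bh()
   raw_spin_unlock_bh()   ->  spin_unlock_bh()

On PREEMPT_RT spinlock_t is built on rt_mutex and may sleep, which is
fine in these RCU read-side sections because RCU readers are
preemptible on PREEMPT_RT.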
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230830053517.166611-1-john.fastabend@gmail.com
parent be4033d3
@@ -18,7 +18,7 @@ struct bpf_stab {
 	struct bpf_map map;
 	struct sock **sks;
 	struct sk_psock_progs progs;
-	raw_spinlock_t lock;
+	spinlock_t lock;
 };
 
 #define SOCK_CREATE_FLAG_MASK \
@@ -44,7 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 
 	bpf_map_init_from_attr(&stab->map, attr);
-	raw_spin_lock_init(&stab->lock);
+	spin_lock_init(&stab->lock);
 
 	stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries *
 				       sizeof(struct sock *),
@@ -411,7 +411,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
 	struct sock *sk;
 	int err = 0;
 
-	raw_spin_lock_bh(&stab->lock);
+	spin_lock_bh(&stab->lock);
 	sk = *psk;
 	if (!sk_test || sk_test == sk)
 		sk = xchg(psk, NULL);
@@ -421,7 +421,7 @@ static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
 	else
 		err = -EINVAL;
 
-	raw_spin_unlock_bh(&stab->lock);
+	spin_unlock_bh(&stab->lock);
 	return err;
 }
 
@@ -487,7 +487,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
 	psock = sk_psock(sk);
 	WARN_ON_ONCE(!psock);
 
-	raw_spin_lock_bh(&stab->lock);
+	spin_lock_bh(&stab->lock);
 	osk = stab->sks[idx];
 	if (osk && flags == BPF_NOEXIST) {
 		ret = -EEXIST;
@@ -501,10 +501,10 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
 	stab->sks[idx] = sk;
 	if (osk)
 		sock_map_unref(osk, &stab->sks[idx]);
-	raw_spin_unlock_bh(&stab->lock);
+	spin_unlock_bh(&stab->lock);
 	return 0;
 out_unlock:
-	raw_spin_unlock_bh(&stab->lock);
+	spin_unlock_bh(&stab->lock);
 	if (psock)
 		sk_psock_put(sk, psock);
 out_free:
@@ -835,7 +835,7 @@ struct bpf_shtab_elem {
 
 struct bpf_shtab_bucket {
 	struct hlist_head head;
-	raw_spinlock_t lock;
+	spinlock_t lock;
 };
 
 struct bpf_shtab {
@@ -910,7 +910,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
 	 * is okay since it's going away only after RCU grace period.
 	 * However, we need to check whether it's still present.
 	 */
-	raw_spin_lock_bh(&bucket->lock);
+	spin_lock_bh(&bucket->lock);
 	elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
 					       elem->key, map->key_size);
 	if (elem_probe && elem_probe == elem) {
@@ -918,7 +918,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
 		sock_map_unref(elem->sk, elem);
 		sock_hash_free_elem(htab, elem);
 	}
-	raw_spin_unlock_bh(&bucket->lock);
+	spin_unlock_bh(&bucket->lock);
 }
 
 static long sock_hash_delete_elem(struct bpf_map *map, void *key)
@@ -932,7 +932,7 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key)
 	hash = sock_hash_bucket_hash(key, key_size);
 	bucket = sock_hash_select_bucket(htab, hash);
 
-	raw_spin_lock_bh(&bucket->lock);
+	spin_lock_bh(&bucket->lock);
 	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
 	if (elem) {
 		hlist_del_rcu(&elem->node);
@@ -940,7 +940,7 @@ static long sock_hash_delete_elem(struct bpf_map *map, void *key)
 		sock_hash_free_elem(htab, elem);
 		ret = 0;
 	}
-	raw_spin_unlock_bh(&bucket->lock);
+	spin_unlock_bh(&bucket->lock);
 	return ret;
 }
 
@@ -1000,7 +1000,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
 	hash = sock_hash_bucket_hash(key, key_size);
 	bucket = sock_hash_select_bucket(htab, hash);
 
-	raw_spin_lock_bh(&bucket->lock);
+	spin_lock_bh(&bucket->lock);
 	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
 	if (elem && flags == BPF_NOEXIST) {
 		ret = -EEXIST;
@@ -1026,10 +1026,10 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
 		sock_map_unref(elem->sk, elem);
 		sock_hash_free_elem(htab, elem);
 	}
-	raw_spin_unlock_bh(&bucket->lock);
+	spin_unlock_bh(&bucket->lock);
 	return 0;
 out_unlock:
-	raw_spin_unlock_bh(&bucket->lock);
+	spin_unlock_bh(&bucket->lock);
 	sk_psock_put(sk, psock);
 out_free:
 	sk_psock_free_link(link);
@@ -1115,7 +1115,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 
 	for (i = 0; i < htab->buckets_num; i++) {
 		INIT_HLIST_HEAD(&htab->buckets[i].head);
-		raw_spin_lock_init(&htab->buckets[i].lock);
+		spin_lock_init(&htab->buckets[i].lock);
 	}
 
 	return &htab->map;
@@ -1147,11 +1147,11 @@ static void sock_hash_free(struct bpf_map *map)
 		 * exists, psock exists and holds a ref to socket. That
 		 * lets us to grab a socket ref too.
 		 */
-		raw_spin_lock_bh(&bucket->lock);
+		spin_lock_bh(&bucket->lock);
 		hlist_for_each_entry(elem, &bucket->head, node)
 			sock_hold(elem->sk);
 		hlist_move_list(&bucket->head, &unlink_list);
-		raw_spin_unlock_bh(&bucket->lock);
+		spin_unlock_bh(&bucket->lock);
 
 		/* Process removed entries out of atomic context to
 		 * block for socket lock before deleting the psock's