Commit 549af008 authored by Parav Pandit's avatar Parav Pandit Committed by Jason Gunthorpe

IB/core: Avoid deadlock during netlink message handling

When rdmacm module is not loaded, and when netlink message is received to
get char device info, it results into a deadlock due to recursive locking
of rdma_nl_mutex with the below call sequence.

[..]
  rdma_nl_rcv()
  mutex_lock()
   [..]
   rdma_nl_rcv_msg()
      ib_get_client_nl_info()
         request_module()
           iw_cm_init()
             rdma_nl_register()
               mutex_lock(); <- Deadlock, acquiring mutex again

Due to above call sequence, following call trace and deadlock is observed.

  kernel: __mutex_lock+0x35e/0x860
  kernel: ? __mutex_lock+0x129/0x860
  kernel: ? rdma_nl_register+0x1a/0x90 [ib_core]
  kernel: rdma_nl_register+0x1a/0x90 [ib_core]
  kernel: ? 0xffffffffc029b000
  kernel: iw_cm_init+0x34/0x1000 [iw_cm]
  kernel: do_one_initcall+0x67/0x2d4
  kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0
  kernel: do_init_module+0x5a/0x223
  kernel: load_module+0x1998/0x1e10
  kernel: ? __symbol_put+0x60/0x60
  kernel: __do_sys_finit_module+0x94/0xe0
  kernel: do_syscall_64+0x5a/0x270
  kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe

  process stack trace:
  [<0>] __request_module+0x1c9/0x460
  [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core]
  [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core]
  [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core]
  [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core]
  [<0>] netlink_unicast+0x179/0x220
  [<0>] netlink_sendmsg+0x2f6/0x3f0
  [<0>] sock_sendmsg+0x30/0x40
  [<0>] ___sys_sendmsg+0x27a/0x290
  [<0>] __sys_sendmsg+0x58/0xa0
  [<0>] do_syscall_64+0x5a/0x270
  [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe

To overcome this deadlock and to allow multiple netlink messages to
progress in parallel, following scheme is implemented.

1. Split the lock protecting the cb_table into a per-index lock, and make
   it a rwlock. This lock is used to ensure no callbacks are running after
   unregistration returns. Since a module will not be registered once it
   is already running callbacks, this avoids the deadlock.

2. Use smp_store_release() to update the cb_table during registration so
   that no lock is required. This avoids lockdep problems with thinking
   all the rwsems are the same lock class.

Fixes: 0e2d00eb ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload")
Link: https://lore.kernel.org/r/20191015080733.18625-1-leon@kernel.orgSigned-off-by: default avatarParav Pandit <parav@mellanox.com>
Signed-off-by: default avatarLeon Romanovsky <leonro@mellanox.com>
Signed-off-by: default avatarJason Gunthorpe <jgg@mellanox.com>
parent a15542bb
......@@ -199,6 +199,7 @@ void ib_mad_cleanup(void);
int ib_sa_init(void);
void ib_sa_cleanup(void);
void rdma_nl_init(void);
void rdma_nl_exit(void);
int ib_nl_handle_resolve_resp(struct sk_buff *skb,
......
......@@ -2716,6 +2716,8 @@ static int __init ib_core_init(void)
goto err_comp_unbound;
}
rdma_nl_init();
ret = addr_init();
if (ret) {
pr_warn("Could't init IB address resolution\n");
......
......@@ -42,9 +42,12 @@
#include <linux/module.h>
#include "core_priv.h"
static DEFINE_MUTEX(rdma_nl_mutex);
static struct {
const struct rdma_nl_cbs *cb_table;
const struct rdma_nl_cbs *cb_table;
/* Synchronizes between ongoing netlink commands and netlink client
* unregistration.
*/
struct rw_semaphore sem;
} rdma_nl_types[RDMA_NL_NUM_CLIENTS];
bool rdma_nl_chk_listeners(unsigned int group)
......@@ -75,70 +78,53 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op)
return (op < max_num_ops[type]) ? true : false;
}
static bool
is_nl_valid(const struct sk_buff *skb, unsigned int type, unsigned int op)
static const struct rdma_nl_cbs *
get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op)
{
const struct rdma_nl_cbs *cb_table;
if (!is_nl_msg_valid(type, op))
return false;
/*
* Currently only NLDEV client is supporting netlink commands in
* non init_net net namespace.
*/
if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV)
return false;
return NULL;
if (!rdma_nl_types[type].cb_table) {
mutex_unlock(&rdma_nl_mutex);
request_module("rdma-netlink-subsys-%d", type);
mutex_lock(&rdma_nl_mutex);
}
cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
if (!cb_table) {
/*
* Didn't get valid reference of the table, attempt module
* load once.
*/
up_read(&rdma_nl_types[type].sem);
cb_table = rdma_nl_types[type].cb_table;
request_module("rdma-netlink-subsys-%d", type);
down_read(&rdma_nl_types[type].sem);
cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
}
if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit))
return false;
return true;
return NULL;
return cb_table;
}
void rdma_nl_register(unsigned int index,
const struct rdma_nl_cbs cb_table[])
{
mutex_lock(&rdma_nl_mutex);
if (!is_nl_msg_valid(index, 0)) {
/*
* All clients are not interesting in success/failure of
* this call. They want to see the print to error log and
* continue their initialization. Print warning for them,
* because it is programmer's error to be here.
*/
mutex_unlock(&rdma_nl_mutex);
WARN(true,
"The not-valid %u index was supplied to RDMA netlink\n",
index);
if (WARN_ON(!is_nl_msg_valid(index, 0)) ||
WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table)))
return;
}
if (rdma_nl_types[index].cb_table) {
mutex_unlock(&rdma_nl_mutex);
WARN(true,
"The %u index is already registered in RDMA netlink\n",
index);
return;
}
rdma_nl_types[index].cb_table = cb_table;
mutex_unlock(&rdma_nl_mutex);
/* Pairs with the READ_ONCE in is_nl_valid() */
smp_store_release(&rdma_nl_types[index].cb_table, cb_table);
}
EXPORT_SYMBOL(rdma_nl_register);
void rdma_nl_unregister(unsigned int index)
{
mutex_lock(&rdma_nl_mutex);
down_write(&rdma_nl_types[index].sem);
rdma_nl_types[index].cb_table = NULL;
mutex_unlock(&rdma_nl_mutex);
up_write(&rdma_nl_types[index].sem);
}
EXPORT_SYMBOL(rdma_nl_unregister);
......@@ -170,15 +156,21 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
unsigned int index = RDMA_NL_GET_CLIENT(type);
unsigned int op = RDMA_NL_GET_OP(type);
const struct rdma_nl_cbs *cb_table;
int err = -EINVAL;
if (!is_nl_valid(skb, index, op))
if (!is_nl_msg_valid(index, op))
return -EINVAL;
cb_table = rdma_nl_types[index].cb_table;
down_read(&rdma_nl_types[index].sem);
cb_table = get_cb_table(skb, index, op);
if (!cb_table)
goto done;
if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) &&
!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
!netlink_capable(skb, CAP_NET_ADMIN)) {
err = -EPERM;
goto done;
}
/*
* LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't
......@@ -186,8 +178,8 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
*/
if (index == RDMA_NL_LS) {
if (cb_table[op].doit)
return cb_table[op].doit(skb, nlh, extack);
return -EINVAL;
err = cb_table[op].doit(skb, nlh, extack);
goto done;
}
/* FIXME: Convert IWCM to properly handle doit callbacks */
if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
......@@ -195,14 +187,15 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
.dump = cb_table[op].dump,
};
if (c.dump)
return netlink_dump_start(skb->sk, skb, nlh, &c);
return -EINVAL;
err = netlink_dump_start(skb->sk, skb, nlh, &c);
goto done;
}
if (cb_table[op].doit)
return cb_table[op].doit(skb, nlh, extack);
return 0;
err = cb_table[op].doit(skb, nlh, extack);
done:
up_read(&rdma_nl_types[index].sem);
return err;
}
/*
......@@ -263,9 +256,7 @@ static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
static void rdma_nl_rcv(struct sk_buff *skb)
{
mutex_lock(&rdma_nl_mutex);
rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg);
mutex_unlock(&rdma_nl_mutex);
}
int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid)
......@@ -297,6 +288,14 @@ int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
}
EXPORT_SYMBOL(rdma_nl_multicast);
void rdma_nl_init(void)
{
int idx;
for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
init_rwsem(&rdma_nl_types[idx].sem);
}
void rdma_nl_exit(void)
{
int idx;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment