Commit 324e227e authored by Jason Gunthorpe

RDMA/device: Add ib_device_get_by_netdev()

Several drivers need to find the ib_device from a given netdev. rxe needs
this at speed in an unsleepable context, so choose to implement the
translation using an RCU-safe hash table.

The hash table can have a many-to-one mapping. This is intended to support
some future case where multiple IB drivers (i.e., iWarp and RoCE) connect to
the same netdevs. driver_ids will need to be different to support this.

In the process this makes the struct ib_device and ib_port_data RCU safe
by deferring their kfrees.
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent c2261dd7
...@@ -40,6 +40,7 @@ ...@@ -40,6 +40,7 @@
#include <linux/netdevice.h> #include <linux/netdevice.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h> #include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h> #include <rdma/ib_addr.h>
#include <rdma/ib_cache.h> #include <rdma/ib_cache.h>
...@@ -134,6 +135,10 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, ...@@ -134,6 +135,10 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
!xa_is_err(entry); \ !xa_is_err(entry); \
(index)++, entry = xan_find_marked(xa, &(index), filter)) (index)++, entry = xan_find_marked(xa, &(index), filter))
/*
 * RCU hash table mapping netdevice pointers to struct ib_port_data.
 *
 * Entries are keyed by the net_device pointer value and linked through
 * ib_port_data::ndev_hash_link. ndev_hash_lock serializes the writers
 * (hash_add_rcu()/hash_del_rcu()); lookups walk the table under
 * rcu_read_lock() without taking the spinlock. The table is declared with
 * 5 hash bits.
 */
static DEFINE_SPINLOCK(ndev_hash_lock);
static DECLARE_HASHTABLE(ndev_hash, 5);
static void free_netdevs(struct ib_device *ib_dev); static void free_netdevs(struct ib_device *ib_dev);
static int ib_security_change(struct notifier_block *nb, unsigned long event, static int ib_security_change(struct notifier_block *nb, unsigned long event,
void *lsm_data); void *lsm_data);
...@@ -144,6 +149,12 @@ static struct notifier_block ibdev_lsm_nb = { ...@@ -144,6 +149,12 @@ static struct notifier_block ibdev_lsm_nb = {
.notifier_call = ib_security_change, .notifier_call = ib_security_change,
}; };
/*
 * Allocation wrapper for the per-device ib_port_data array.
 *
 * The rcu_head sits in front of the flexible pdata[] array so the whole
 * allocation can be released with kfree_rcu(). ib_device::port_data points
 * at pdata[], and the wrapper is recovered with container_of() when the
 * array is freed.
 */
struct ib_port_data_rcu {
struct rcu_head rcu_head;
struct ib_port_data pdata[];
};
static int ib_device_check_mandatory(struct ib_device *device) static int ib_device_check_mandatory(struct ib_device *device)
{ {
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
...@@ -295,9 +306,12 @@ static void ib_device_release(struct device *device) ...@@ -295,9 +306,12 @@ static void ib_device_release(struct device *device)
WARN_ON(refcount_read(&dev->refcount)); WARN_ON(refcount_read(&dev->refcount));
ib_cache_release_one(dev); ib_cache_release_one(dev);
ib_security_release_port_pkey_list(dev); ib_security_release_port_pkey_list(dev);
kfree(dev->port_data);
xa_destroy(&dev->client_data); xa_destroy(&dev->client_data);
kfree(dev); if (dev->port_data)
kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
pdata[0]),
rcu_head);
kfree_rcu(dev, rcu_head);
} }
static int ib_device_uevent(struct device *device, static int ib_device_uevent(struct device *device,
...@@ -468,6 +482,7 @@ static void remove_client_context(struct ib_device *device, ...@@ -468,6 +482,7 @@ static void remove_client_context(struct ib_device *device,
static int alloc_port_data(struct ib_device *device) static int alloc_port_data(struct ib_device *device)
{ {
struct ib_port_data_rcu *pdata_rcu;
unsigned int port; unsigned int port;
if (device->port_data) if (device->port_data)
...@@ -484,17 +499,26 @@ static int alloc_port_data(struct ib_device *device) ...@@ -484,17 +499,26 @@ static int alloc_port_data(struct ib_device *device)
* Therefore port_data is declared as a 1 based array with potential * Therefore port_data is declared as a 1 based array with potential
* empty slots at the beginning. * empty slots at the beginning.
*/ */
device->port_data = kcalloc(rdma_end_port(device) + 1, pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
sizeof(*device->port_data), GFP_KERNEL); rdma_end_port(device) + 1),
if (!device->port_data) GFP_KERNEL);
if (!pdata_rcu)
return -ENOMEM; return -ENOMEM;
/*
* The rcu_head is put in front of the port data array and the stored
* pointer is adjusted since we never need to see that member until
* kfree_rcu.
*/
device->port_data = pdata_rcu->pdata;
rdma_for_each_port (device, port) { rdma_for_each_port (device, port) {
struct ib_port_data *pdata = &device->port_data[port]; struct ib_port_data *pdata = &device->port_data[port];
pdata->ib_dev = device;
spin_lock_init(&pdata->pkey_list_lock); spin_lock_init(&pdata->pkey_list_lock);
INIT_LIST_HEAD(&pdata->pkey_list); INIT_LIST_HEAD(&pdata->pkey_list);
spin_lock_init(&pdata->netdev_lock); spin_lock_init(&pdata->netdev_lock);
INIT_HLIST_NODE(&pdata->ndev_hash_link);
} }
return 0; return 0;
} }
...@@ -1042,6 +1066,29 @@ int ib_query_port(struct ib_device *device, ...@@ -1042,6 +1066,29 @@ int ib_query_port(struct ib_device *device,
} }
EXPORT_SYMBOL(ib_query_port); EXPORT_SYMBOL(ib_query_port);
static void add_ndev_hash(struct ib_port_data *pdata)
{
unsigned long flags;
might_sleep();
spin_lock_irqsave(&ndev_hash_lock, flags);
if (hash_hashed(&pdata->ndev_hash_link)) {
hash_del_rcu(&pdata->ndev_hash_link);
spin_unlock_irqrestore(&ndev_hash_lock, flags);
/*
* We cannot do hash_add_rcu after a hash_del_rcu until the
* grace period
*/
synchronize_rcu();
spin_lock_irqsave(&ndev_hash_lock, flags);
}
if (pdata->netdev)
hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
(uintptr_t)pdata->netdev);
spin_unlock_irqrestore(&ndev_hash_lock, flags);
}
/** /**
* ib_device_set_netdev - Associate the ib_dev with an underlying net_device * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
* @ib_dev: Device to modify * @ib_dev: Device to modify
...@@ -1078,17 +1125,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, ...@@ -1078,17 +1125,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
pdata = &ib_dev->port_data[port]; pdata = &ib_dev->port_data[port];
spin_lock_irqsave(&pdata->netdev_lock, flags); spin_lock_irqsave(&pdata->netdev_lock, flags);
if (pdata->netdev == ndev) { old_ndev = rcu_dereference_protected(
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
if (old_ndev == ndev) {
spin_unlock_irqrestore(&pdata->netdev_lock, flags); spin_unlock_irqrestore(&pdata->netdev_lock, flags);
return 0; return 0;
} }
old_ndev = pdata->netdev;
if (ndev) if (ndev)
dev_hold(ndev); dev_hold(ndev);
pdata->netdev = ndev; rcu_assign_pointer(pdata->netdev, ndev);
spin_unlock_irqrestore(&pdata->netdev_lock, flags); spin_unlock_irqrestore(&pdata->netdev_lock, flags);
add_ndev_hash(pdata);
if (old_ndev) if (old_ndev)
dev_put(old_ndev); dev_put(old_ndev);
...@@ -1103,11 +1152,24 @@ static void free_netdevs(struct ib_device *ib_dev) ...@@ -1103,11 +1152,24 @@ static void free_netdevs(struct ib_device *ib_dev)
rdma_for_each_port (ib_dev, port) { rdma_for_each_port (ib_dev, port) {
struct ib_port_data *pdata = &ib_dev->port_data[port]; struct ib_port_data *pdata = &ib_dev->port_data[port];
struct net_device *ndev;
spin_lock_irqsave(&pdata->netdev_lock, flags); spin_lock_irqsave(&pdata->netdev_lock, flags);
if (pdata->netdev) { ndev = rcu_dereference_protected(
dev_put(pdata->netdev); pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
pdata->netdev = NULL; if (ndev) {
spin_lock(&ndev_hash_lock);
hash_del_rcu(&pdata->ndev_hash_link);
spin_unlock(&ndev_hash_lock);
/*
* If this is the last dev_put there is still a
* synchronize_rcu before the netdev is kfreed, so we
* can continue to rely on unlocked pointer
* comparisons after the put
*/
rcu_assign_pointer(pdata->netdev, NULL);
dev_put(ndev);
} }
spin_unlock_irqrestore(&pdata->netdev_lock, flags); spin_unlock_irqrestore(&pdata->netdev_lock, flags);
} }
...@@ -1132,7 +1194,8 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, ...@@ -1132,7 +1194,8 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
res = ib_dev->ops.get_netdev(ib_dev, port); res = ib_dev->ops.get_netdev(ib_dev, port);
else { else {
spin_lock(&pdata->netdev_lock); spin_lock(&pdata->netdev_lock);
res = pdata->netdev; res = rcu_dereference_protected(
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
if (res) if (res)
dev_hold(res); dev_hold(res);
spin_unlock(&pdata->netdev_lock); spin_unlock(&pdata->netdev_lock);
...@@ -1150,6 +1213,38 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, ...@@ -1150,6 +1213,38 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
return res; return res;
} }
/**
 * ib_device_get_by_netdev - Look up the IB device bound to a netdev
 * @ndev: netdev to search for
 * @driver_id: Required driver ID; RDMA_DRIVER_UNKNOWN accepts any driver
 *
 * Walks the ndev_hash bucket for @ndev under rcu_read_lock() and returns the
 * first ib_device whose port is associated with @ndev (as set up by
 * ib_device_set_netdev()), whose driver ID is acceptable, and on which a
 * reference could be taken with ib_device_try_get().
 *
 * Return: the held ib_device, or NULL if no match was found. The caller
 * must release a non-NULL result with ib_device_put().
 */
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
					  enum rdma_driver_id driver_id)
{
	struct ib_device *found = NULL;
	struct ib_port_data *pdata;

	rcu_read_lock();
	hash_for_each_possible_rcu (ndev_hash, pdata, ndev_hash_link,
				    (uintptr_t)ndev) {
		/* Different netdevs can share a bucket; compare the pointer. */
		if (rcu_access_pointer(pdata->netdev) != ndev)
			continue;

		if (driver_id != RDMA_DRIVER_UNKNOWN &&
		    pdata->ib_dev->driver_id != driver_id)
			continue;

		/* Skip devices on which a reference can no longer be taken. */
		if (!ib_device_try_get(pdata->ib_dev))
			continue;

		found = pdata->ib_dev;
		break;
	}
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL(ib_device_get_by_netdev);
/** /**
* ib_enum_roce_netdev - enumerate all RoCE ports * ib_enum_roce_netdev - enumerate all RoCE ports
* @ib_dev : IB device we want to query * @ib_dev : IB device we want to query
......
...@@ -2198,6 +2198,8 @@ struct ib_port_immutable { ...@@ -2198,6 +2198,8 @@ struct ib_port_immutable {
}; };
struct ib_port_data { struct ib_port_data {
struct ib_device *ib_dev;
struct ib_port_immutable immutable; struct ib_port_immutable immutable;
spinlock_t pkey_list_lock; spinlock_t pkey_list_lock;
...@@ -2206,7 +2208,8 @@ struct ib_port_data { ...@@ -2206,7 +2208,8 @@ struct ib_port_data {
struct ib_port_cache cache; struct ib_port_cache cache;
spinlock_t netdev_lock; spinlock_t netdev_lock;
struct net_device *netdev; struct net_device __rcu *netdev;
struct hlist_node ndev_hash_link;
}; };
/* rdma netdev type - specifies protocol type */ /* rdma netdev type - specifies protocol type */
...@@ -2545,6 +2548,7 @@ struct ib_device { ...@@ -2545,6 +2548,7 @@ struct ib_device {
struct device *dma_device; struct device *dma_device;
struct ib_device_ops ops; struct ib_device_ops ops;
char name[IB_DEVICE_NAME_MAX]; char name[IB_DEVICE_NAME_MAX];
struct rcu_head rcu_head;
struct list_head event_handler_list; struct list_head event_handler_list;
spinlock_t event_handler_lock; spinlock_t event_handler_lock;
...@@ -3996,6 +4000,10 @@ static inline bool ib_device_try_get(struct ib_device *dev) ...@@ -3996,6 +4000,10 @@ static inline bool ib_device_try_get(struct ib_device *dev)
} }
void ib_device_put(struct ib_device *device); void ib_device_put(struct ib_device *device);
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
enum rdma_driver_id driver_id);
struct ib_device *ib_device_get_by_name(const char *name,
enum rdma_driver_id driver_id);
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
u16 pkey, const union ib_gid *gid, u16 pkey, const union ib_gid *gid,
const struct sockaddr *addr); const struct sockaddr *addr);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment