Commit b9358bdb authored by Jason Gunthorpe

RDMA/mlx5: Fix locking in MR cache work queue

All of the members of mlx5_cache_ent must be accessed while holding the
spinlock; add the missing spinlock acquisition in __cache_work_func().
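
For illustration, the reworked work function follows this shape (a sketch of
the locking pattern only, not the exact driver code; add_keys() is the
existing helper that can sleep):

  spin_lock_irq(&ent->lock);
  if (ent->disabled)
      goto out;
  /* ... read ent->available_mrs, ent->pending, ent->limit under the lock ... */
  spin_unlock_irq(&ent->lock);
  err = add_keys(ent, 1);        /* can sleep; must not hold ent->lock */
  spin_lock_irq(&ent->lock);
  if (ent->disabled)             /* re-check: state may have changed while unlocked */
      goto out;
  /* ... decide whether to queue more work, still under the lock ... */
  out:
  spin_unlock_irq(&ent->lock);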

Using cache->stopped and flush_workqueue() is an inherently racy way to
shut down self-scheduling work on a queue. Replace it with ent->disabled
under lock, and always check disabled before queuing any new work. Use
cancel_work_sync() to shut down the queue.
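
As a sketch of that shutdown pattern (the names mirror the driver's
structures, but the snippet is illustrative rather than the literal code):

  /* Producers: never queue new work once the entry is disabled. */
  spin_lock_irq(&ent->lock);
  if (!ent->disabled)
      queue_work(ent->dev->cache.wq, &ent->work);
  spin_unlock_irq(&ent->lock);

  /* Teardown: mark disabled under the lock, then cancel synchronously.
   * Work that sees ent->disabled never re-queues itself, so nothing can
   * be re-armed after cancel_work_sync() returns.
   */
  spin_lock_irq(&ent->lock);
  ent->disabled = true;
  spin_unlock_irq(&ent->lock);
  cancel_work_sync(&ent->work);
  cancel_delayed_work_sync(&ent->dwork);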

Use READ_ONCE/WRITE_ONCE for dev->last_add to manage concurrency as
coherency is less important here.

Split fill_delay from the bitfield. C bitfield updates are not atomic and
this is just a mess. Use READ_ONCE/WRITE_ONCE, but this could also use
test_bit()/set_bit().
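
For comparison, a rough sketch of both options (the patch uses the plain u8
with READ_ONCE/WRITE_ONCE; the bitops variant assumes a hypothetical
unsigned long dev->flags word, which is not a field this driver has):

  /* Plain byte flag: READ_ONCE/WRITE_ONCE stop the compiler from tearing
   * or fusing the plain loads and stores.
   */
  WRITE_ONCE(dev->fill_delay, 1);
  if (READ_ONCE(dev->fill_delay))
      return;

  /* Atomic bitops alternative (hypothetical dev->flags, bit 0): */
  set_bit(0, &dev->flags);
  if (test_bit(0, &dev->flags))
      return;
  clear_bit(0, &dev->flags);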

Link: https://lore.kernel.org/r/20200310082238.239865-11-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent ad2d3ef4
@@ -699,6 +699,8 @@ struct mlx5_cache_ent {
     u32 access_mode;
     u32 page;
 
+    u8 disabled:1;
+
     /*
      * - available_mrs is the length of list head, ie the number of MRs
      * available for immediate allocation.
@@ -725,7 +727,6 @@ struct mlx5_cache_ent {
 struct mlx5_mr_cache {
     struct workqueue_struct *wq;
     struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES];
-    int stopped;
     struct dentry *root;
     unsigned long last_add;
 };
@@ -995,10 +996,10 @@ struct mlx5_ib_dev {
      */
     struct mutex cap_mask_mutex;
     u8 ib_active:1;
-    u8 fill_delay:1;
     u8 is_rep:1;
     u8 lag_active:1;
     u8 wc_support:1;
+    u8 fill_delay;
     struct umr_common umrc;
     /* sync used page count stats
      */
...
@@ -113,13 +113,13 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
     struct mlx5_cache_ent *ent = mr->cache_ent;
     unsigned long flags;
 
-    spin_lock_irqsave(&ent->lock, flags);
-    ent->pending--;
-    spin_unlock_irqrestore(&ent->lock, flags);
     if (status) {
         mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
         kfree(mr);
-        dev->fill_delay = 1;
+        spin_lock_irqsave(&ent->lock, flags);
+        ent->pending--;
+        WRITE_ONCE(dev->fill_delay, 1);
+        spin_unlock_irqrestore(&ent->lock, flags);
         mod_timer(&dev->delay_timer, jiffies + HZ);
         return;
     }
@@ -128,12 +128,13 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
     mr->mmkey.key |= mlx5_idx_to_mkey(
         MLX5_GET(create_mkey_out, mr->out, mkey_index));
 
-    dev->cache.last_add = jiffies;
+    WRITE_ONCE(dev->cache.last_add, jiffies);
 
     spin_lock_irqsave(&ent->lock, flags);
     list_add_tail(&mr->list, &ent->head);
     ent->available_mrs++;
     ent->total_mrs++;
+    ent->pending--;
     /*
      * Creating is always done in response to some demand, so do not call
      * queue_adjust_cache_locked().
@@ -159,11 +160,6 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
     mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 
     for (i = 0; i < num; i++) {
-        if (ent->pending >= MAX_PENDING_REG_MR) {
-            err = -EAGAIN;
-            break;
-        }
-
         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
         if (!mr) {
             err = -ENOMEM;
@@ -184,6 +180,12 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
         MLX5_SET(mkc, mkc, log_page_size, ent->page);
 
         spin_lock_irq(&ent->lock);
+        if (ent->pending >= MAX_PENDING_REG_MR) {
+            err = -EAGAIN;
+            spin_unlock_irq(&ent->lock);
+            kfree(mr);
+            break;
+        }
         ent->pending++;
         spin_unlock_irq(&ent->lock);
         err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
@@ -204,15 +206,13 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
     return err;
 }
 
-static void remove_cache_mr(struct mlx5_cache_ent *ent)
+static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 {
     struct mlx5_ib_mr *mr;
 
-    spin_lock_irq(&ent->lock);
-    if (list_empty(&ent->head)) {
-        spin_unlock_irq(&ent->lock);
+    lockdep_assert_held(&ent->lock);
+    if (list_empty(&ent->head))
         return;
-    }
     mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
     list_del(&mr->list);
     ent->available_mrs--;
@@ -220,6 +220,7 @@ static void remove_cache_mr(struct mlx5_cache_ent *ent)
     spin_unlock_irq(&ent->lock);
     mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
     kfree(mr);
+    spin_lock_irq(&ent->lock);
 }
 
 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
@@ -248,9 +249,7 @@ static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
             } else
                 return 0;
         } else {
-            spin_unlock_irq(&ent->lock);
-            remove_cache_mr(ent);
-            spin_lock_irq(&ent->lock);
+            remove_cache_mr_locked(ent);
         }
     }
 }
@@ -359,16 +358,21 @@ static const struct file_operations limit_fops = {
     .read = limit_read,
 };
 
-static int someone_adding(struct mlx5_mr_cache *cache)
+static bool someone_adding(struct mlx5_mr_cache *cache)
 {
-    int i;
+    unsigned int i;
 
     for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-        if (cache->ent[i].available_mrs < cache->ent[i].limit)
-            return 1;
-    }
-
-    return 0;
+        struct mlx5_cache_ent *ent = &cache->ent[i];
+        bool ret;
+
+        spin_lock_irq(&ent->lock);
+        ret = ent->available_mrs < ent->limit;
+        spin_unlock_irq(&ent->lock);
+        if (ret)
+            return true;
+    }
+    return false;
 }
 
 /*
@@ -380,6 +384,8 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 {
     lockdep_assert_held(&ent->lock);
 
+    if (ent->disabled)
+        return;
     if (ent->available_mrs < ent->limit ||
         ent->available_mrs > 2 * ent->limit)
         queue_work(ent->dev->cache.wq, &ent->work);
@@ -391,27 +397,42 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
     struct mlx5_mr_cache *cache = &dev->cache;
     int err;
 
-    if (cache->stopped)
-        return;
+    spin_lock_irq(&ent->lock);
+    if (ent->disabled)
+        goto out;
 
-    if (ent->available_mrs < 2 * ent->limit && !dev->fill_delay) {
+    if (ent->available_mrs + ent->pending < 2 * ent->limit &&
+        !READ_ONCE(dev->fill_delay)) {
+        spin_unlock_irq(&ent->lock);
         err = add_keys(ent, 1);
-        if (ent->available_mrs < 2 * ent->limit) {
+        spin_lock_irq(&ent->lock);
+        if (ent->disabled)
+            goto out;
+        if (err) {
             if (err == -EAGAIN) {
                 mlx5_ib_dbg(dev, "returned eagain, order %d\n",
                             ent->order);
                 queue_delayed_work(cache->wq, &ent->dwork,
                                    msecs_to_jiffies(3));
-            } else if (err) {
-                mlx5_ib_warn(dev, "command failed order %d, err %d\n",
-                             ent->order, err);
+            } else {
+                mlx5_ib_warn(
+                    dev,
+                    "command failed order %d, err %d\n",
+                    ent->order, err);
                 queue_delayed_work(cache->wq, &ent->dwork,
                                    msecs_to_jiffies(1000));
-            } else {
-                queue_work(cache->wq, &ent->work);
             }
         }
+
+        /*
+         * Once we start populating due to hitting a low water mark
+         * continue until we pass the high water mark.
+         */
+        if (ent->available_mrs + ent->pending < 2 * ent->limit)
+            queue_work(cache->wq, &ent->work);
     } else if (ent->available_mrs > 2 * ent->limit) {
+        bool need_delay;
+
         /*
          * The remove_cache_mr() logic is performed as garbage
          * collection task. Such task is intended to be run when no
@@ -424,15 +445,20 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
          * the garbage collection work to try to run in next cycle, in
          * order to free CPU resources to other tasks.
          */
-        if (!need_resched() && !someone_adding(cache) &&
-            time_after(jiffies, cache->last_add + 300 * HZ)) {
-            remove_cache_mr(ent);
-            if (ent->available_mrs > ent->limit)
-                queue_work(cache->wq, &ent->work);
-        } else {
+        spin_unlock_irq(&ent->lock);
+        need_delay = need_resched() || someone_adding(cache) ||
+                     time_after(jiffies,
+                                READ_ONCE(cache->last_add) + 300 * HZ);
+        spin_lock_irq(&ent->lock);
+        if (ent->disabled)
+            goto out;
+        if (need_delay)
             queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
-        }
+        remove_cache_mr_locked(ent);
+        queue_adjust_cache_locked(ent);
     }
-}
+out:
+    spin_unlock_irq(&ent->lock);
+}
 
 static void delayed_cache_work_func(struct work_struct *work)
@@ -613,7 +639,7 @@ static void delay_time_func(struct timer_list *t)
 {
     struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 
-    dev->fill_delay = 0;
+    WRITE_ONCE(dev->fill_delay, 0);
 }
 
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
@@ -673,13 +699,20 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 {
-    int i;
+    unsigned int i;
 
     if (!dev->cache.wq)
         return 0;
 
-    dev->cache.stopped = 1;
-    flush_workqueue(dev->cache.wq);
+    for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+        struct mlx5_cache_ent *ent = &dev->cache.ent[i];
+
+        spin_lock_irq(&ent->lock);
+        ent->disabled = true;
+        spin_unlock_irq(&ent->lock);
+        cancel_work_sync(&ent->work);
+        cancel_delayed_work_sync(&ent->dwork);
+    }
 
     mlx5_mr_cache_debugfs_cleanup(dev);
     mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
...