Commit 134874e2 authored by Tejun Heo

workqueue: Allow cancel_work_sync() and disable_work() from atomic contexts on BH work items

Now that work_grab_pending() can always grab the PENDING bit without
sleeping, the only thing that prevents allowing cancel_work_sync() of a BH
work item from an atomic context is the flushing of the in-flight instance.

When we're flushing a BH work item for cancel_work_sync(), we know that the
work item is not queued and must be executing in a BH context, which means
that it's safe to busy-wait for its completion from a non-hardirq atomic
context.
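
A minimal sketch of that reasoning (illustrative only, not part of the patch; the helper name is made up): a completion that will be signaled from BH (softirq) context always makes forward progress, so a non-hardirq atomic waiter may simply spin on it. The actual __flush_work() change below additionally flips local_bh_disable()/local_bh_enable() on PREEMPT_RT to avoid a live lock.

    #include <linux/completion.h>
    #include <linux/processor.h>        /* cpu_relax() */

    /* Spin until a completion that a BH-context handler will signal is done. */
    static void spin_wait_for_bh_completion(struct completion *done)
    {
            while (!try_wait_for_completion(done))
                    cpu_relax();        /* the BH-context signaler cannot be blocked by us */
    }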

This patch updates __flush_work() so that it busy-waits when flushing a BH
work item for cancel_work_sync(). might_sleep() is pushed from
start_flush_work() to its callers - when operating on a BH work item,
__cancel_work_sync() now enforces !in_hardirq() instead of might_sleep().

This allows cancel_work_sync() and disable_work() to be called from
non-hardirq atomic contexts on BH work items.
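
For illustration, a hypothetical driver-style usage sketch (my_dev, my_bh_work_fn and friends are made-up names, not from this patch): a work item last queued on the BH workqueue system_bh_wq can now be canceled synchronously from a non-hardirq atomic context such as a spin_lock_bh() section.

    #include <linux/workqueue.h>
    #include <linux/spinlock.h>

    struct my_dev {
            spinlock_t lock;
            struct work_struct bh_work;
    };

    /* Runs in BH (softirq) context because it is queued on system_bh_wq. */
    static void my_bh_work_fn(struct work_struct *work)
    {
            /* must not take dev->lock, or my_dev_stop() below would deadlock */
    }

    static void my_dev_start(struct my_dev *dev)
    {
            spin_lock_init(&dev->lock);
            INIT_WORK(&dev->bh_work, my_bh_work_fn);
            queue_work(system_bh_wq, &dev->bh_work);
    }

    static void my_dev_stop(struct my_dev *dev)
    {
            spin_lock_bh(&dev->lock);   /* non-hardirq atomic context */
            /*
             * Previously this required a sleepable context; it is allowed here
             * now because bh_work was last queued on a BH workqueue.
             */
            cancel_work_sync(&dev->bh_work);
            spin_unlock_bh(&dev->lock);
    }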

v3: In __flush_work(), test WORK_OFFQ_BH to tell whether a work item being
    canceled can be busy waited instead of making start_flush_work() return
    the pool. (Lai)

v2: Lai pointed out that __flush_work() was accessing pool->flags outside
    the RCU critical section protecting the pool pointer. Fix it by testing
    and remembering the result inside the RCU critical section.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <jiangshanlai@gmail.com>
parent 456a78ee
@@ -4105,8 +4105,6 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 	struct pool_workqueue *pwq;
 	struct workqueue_struct *wq;
 
-	might_sleep();
-
 	rcu_read_lock();
 	pool = get_work_pool(work);
 	if (!pool) {
@@ -4158,6 +4156,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 static bool __flush_work(struct work_struct *work, bool from_cancel)
 {
 	struct wq_barrier barr;
+	unsigned long data;
 
 	if (WARN_ON(!wq_online))
 		return false;
@@ -4165,13 +4164,41 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
 	if (WARN_ON(!work->func))
 		return false;
 
-	if (start_flush_work(work, &barr, from_cancel)) {
-		wait_for_completion(&barr.done);
-		destroy_work_on_stack(&barr.work);
-		return true;
-	} else {
+	if (!start_flush_work(work, &barr, from_cancel))
 		return false;
+
+	/*
+	 * start_flush_work() returned %true. If @from_cancel is set, we know
+	 * that @work must have been executing during start_flush_work() and
+	 * can't currently be queued. Its data must contain OFFQ bits. If @work
+	 * was queued on a BH workqueue, we also know that it was running in the
+	 * BH context and thus can be busy-waited.
+	 */
+	data = *work_data_bits(work);
+	if (from_cancel &&
+	    !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) {
+		/*
+		 * On RT, prevent a live lock when %current preempted soft
+		 * interrupt processing or prevents ksoftirqd from running by
+		 * keeping flipping BH. If the BH work item runs on a different
+		 * CPU then this has no effect other than doing the BH
+		 * disable/enable dance for nothing. This is copied from
+		 * kernel/softirq.c::tasklet_unlock_spin_wait().
+		 */
+		while (!try_wait_for_completion(&barr.done)) {
+			if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+				local_bh_disable();
+				local_bh_enable();
+			} else {
+				cpu_relax();
+			}
+		}
+	} else {
+		wait_for_completion(&barr.done);
 	}
+
+	destroy_work_on_stack(&barr.work);
+	return true;
 }
 
 /**
@@ -4187,6 +4214,7 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
  */
 bool flush_work(struct work_struct *work)
 {
+	might_sleep();
 	return __flush_work(work, false);
 }
 EXPORT_SYMBOL_GPL(flush_work);
@@ -4276,6 +4304,11 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
 
 	ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE);
 
+	if (*work_data_bits(work) & WORK_OFFQ_BH)
+		WARN_ON_ONCE(in_hardirq());
+	else
+		might_sleep();
+
 	/*
 	 * Skip __flush_work() during early boot when we know that @work isn't
 	 * executing. This allows canceling during early boot.
@@ -4302,19 +4335,19 @@ EXPORT_SYMBOL(cancel_work);
  * cancel_work_sync - cancel a work and wait for it to finish
  * @work: the work to cancel
  *
- * Cancel @work and wait for its execution to finish. This function
- * can be used even if the work re-queues itself or migrates to
- * another workqueue. On return from this function, @work is
- * guaranteed to be not pending or executing on any CPU.
+ * Cancel @work and wait for its execution to finish. This function can be used
+ * even if the work re-queues itself or migrates to another workqueue. On return
+ * from this function, @work is guaranteed to be not pending or executing on any
+ * CPU as long as there aren't racing enqueues.
  *
- * cancel_work_sync(&delayed_work->work) must not be used for
- * delayed_work's. Use cancel_delayed_work_sync() instead.
+ * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's.
+ * Use cancel_delayed_work_sync() instead.
  *
- * The caller must ensure that the workqueue on which @work was last
- * queued can't be destroyed before this function returns.
+ * Must be called from a sleepable context if @work was last queued on a non-BH
+ * workqueue. Can also be called from non-hardirq atomic contexts including BH
+ * if @work was last queued on a BH workqueue.
  *
- * Return:
- * %true if @work was pending, %false otherwise.
+ * Returns %true if @work was pending, %false otherwise.
  */
 bool cancel_work_sync(struct work_struct *work)
 {
@@ -4384,8 +4417,11 @@ EXPORT_SYMBOL_GPL(disable_work);
  * Similar to disable_work() but also wait for @work to finish if currently
  * executing.
  *
- * Must be called from a sleepable context. Returns %true if @work was pending,
- * %false otherwise.
+ * Must be called from a sleepable context if @work was last queued on a non-BH
+ * workqueue. Can also be called from non-hardirq atomic contexts including BH
+ * if @work was last queued on a BH workqueue.
+ *
+ * Returns %true if @work was pending, %false otherwise.
  */
 bool disable_work_sync(struct work_struct *work)
 {