Commit 7ab044a4 authored by Linus Torvalds

Merge tag 'wq-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq

Pull workqueue updates from Tejun Heo:

 - Concurrency-managed per-cpu work items that hog CPUs and delay the
   execution of other work items are now automatically detected and
   excluded from concurrency management. Reporting on such work items
   can also be enabled through a config option.

 - Added tools/workqueue/wq_monitor.py which improves visibility into
   workqueue usages and behaviors.

 - Arnd's minimal fix for gcc-13 enum warning on 32bit compiles,
   superseded by commit afa4bb77 in mainline.

* tag 'wq-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: Disable per-cpu CPU hog detection when wq_cpu_intensive_thresh_us is 0
  workqueue: Fix WARN_ON_ONCE() triggers in worker_enter_idle()
  workqueue: fix enum type for gcc-13
  workqueue: Track and monitor per-workqueue CPU time usage
  workqueue: Report work funcs that trigger automatic CPU_INTENSIVE mechanism
  workqueue: Automatically mark CPU-hogging work items CPU_INTENSIVE
  workqueue: Improve locking rule description for worker fields
  workqueue: Move worker_set/clr_flags() upwards
  workqueue: Re-order struct worker fields
  workqueue: Add pwq->stats[] and a monitoring script
  Further upgrade queue_work_on() comment
parents 18eb3b6d 18c8ae81
......@@ -6972,6 +6972,18 @@
it can be updated at runtime by writing to the
corresponding sysfs file.
workqueue.cpu_intensive_thresh_us=
Per-cpu work items which run for longer than this
threshold are automatically considered CPU intensive
and excluded from concurrency management to prevent
them from noticeably delaying other per-cpu work
items. Default is 10000 (10ms).
If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel
will report the work functions which violate this
threshold repeatedly. They are likely good
candidates for using WQ_UNBOUND workqueues instead.
workqueue.disable_numa
By default, all work items queued to unbound
workqueues are affine to the NUMA nodes they're
......
......@@ -348,6 +348,37 @@ Guidelines
level of locality in wq operations and work item execution.
Monitoring
==========
Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::
$ tools/workqueue/wq_monitor.py events
total infl CPUtime CPUitsv CMwake mayday rescued
events 18545 0 6.1 0 5 - -
events_highpri 8 0 0.0 0 0 - -
events_long 3 0 0.0 0 0 - -
events_unbound 38306 0 0.1 - - - -
events_freezable 0 0 0.0 0 0 - -
events_power_efficient 29598 0 0.2 0 0 - -
events_freezable_power_ 10 0 0.0 0 0 - -
sock_diag_events 0 0 0.0 0 0 - -
total infl CPUtime CPUitsv CMwake mayday rescued
events 18548 0 6.1 0 5 - -
events_highpri 8 0 0.0 0 0 - -
events_long 3 0 0.0 0 0 - -
events_unbound 38322 0 0.1 - - - -
events_freezable 0 0 0.0 0 0 - -
events_power_efficient 29603 0 0.2 0 0 - -
events_freezable_power_ 10 0 0.0 0 0 - -
sock_diag_events 0 0 0.0 0 0 - -
...
See the command's help message for more info.
Debugging
=========
......@@ -387,6 +418,7 @@ the stack trace of the offending worker thread. ::
The work item's function should be trivially visible in the stack
trace.
Non-reentrance Conditions
=========================
......
......@@ -5670,6 +5670,9 @@ void scheduler_tick(void)
perf_event_task_tick();
if (curr->flags & PF_WQ_WORKER)
wq_worker_tick(curr);
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
......
This diff is collapsed.
......@@ -28,13 +28,18 @@ struct worker {
struct hlist_node hentry; /* L: while busy */
};
struct work_struct *current_work; /* L: work being processed */
work_func_t current_func; /* L: current_work's fn */
struct pool_workqueue *current_pwq; /* L: current_work's pwq */
unsigned int current_color; /* L: current_work's color */
struct list_head scheduled; /* L: scheduled works */
struct work_struct *current_work; /* K: work being processed and its */
work_func_t current_func; /* K: function */
struct pool_workqueue *current_pwq; /* K: pwq */
u64 current_at; /* K: runtime at start or last wakeup */
unsigned int current_color; /* K: color */
int sleeping; /* S: is worker sleeping? */
/* 64 bytes boundary on 64bit, 32 on 32bit */
/* used by the scheduler to determine a worker's last known identity */
work_func_t last_func; /* K: last work's fn */
struct list_head scheduled; /* L: scheduled works */
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* A: the associated pool */
......@@ -42,10 +47,9 @@ struct worker {
struct list_head node; /* A: anchored at pool->workers */
/* A: runs through worker->node */
unsigned long last_active; /* L: last active timestamp */
unsigned long last_active; /* K: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
int sleeping; /* None */
/*
* Opaque string set with work_set_desc(). Printed out with task
......@@ -55,9 +59,6 @@ struct worker {
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
/* used by the scheduler to determine a worker's last known identity */
work_func_t last_func;
};
/**
......@@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
*/
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
......@@ -1134,6 +1134,19 @@ config WQ_WATCHDOG
state. This can be configured through kernel parameter
"workqueue.watchdog_thresh" and its sysfs counterpart.
config WQ_CPU_INTENSIVE_REPORT
bool "Report per-cpu work items which hog CPU for too long"
depends on DEBUG_KERNEL
help
Say Y here to enable reporting of concurrency-managed per-cpu work
items that hog CPUs for longer than
workqueue.cpu_intensive_thresh_us. Workqueue automatically
detects and excludes them from concurrency management to prevent
them from stalling other per-cpu work items. Occasional
triggering may not necessarily indicate a problem. Repeated
triggering likely indicates that the work item should be switched
to use an unbound workqueue.
config TEST_LOCKUP
tristate "Test module to generate lockups"
depends on m
......
#!/usr/bin/env drgn
#
# Copyright (C) 2023 Tejun Heo <tj@kernel.org>
# Copyright (C) 2023 Meta Platforms, Inc. and affiliates.
desc = """
This is a drgn script to monitor workqueues. For more info on drgn, visit
https://github.com/osandov/drgn.
total Total number of work items executed by the workqueue.
infl The number of currently in-flight work items.
CPUtime Total CPU time consumed by the workqueue in seconds. This is
sampled from scheduler ticks and only provides ballpark
measurement. "nohz_full=" CPUs are excluded from measurement.
CPUitsv The number of times a concurrency-managed work item hogged CPU
longer than the threshold (workqueue.cpu_intensive_thresh_us)
and got excluded from concurrency management to avoid stalling
other work items.
CMwake The number of concurrency-management wake-ups while executing a
work item of the workqueue.
mayday The number of times the rescuer was requested while waiting for
new worker creation.
rescued The number of work items executed by the rescuer.
"""
import sys
import signal
import os
import re
import time
import json
import drgn
from drgn.helpers.linux.list import list_for_each_entry,list_empty
from drgn.helpers.linux.cpumask import for_each_possible_cpu
import argparse
# Command-line interface. RawTextHelpFormatter keeps the column layout of
# the multi-line `desc` help text intact instead of re-wrapping it.
parser = argparse.ArgumentParser(
    description=desc,
    formatter_class=argparse.RawTextHelpFormatter,
)

# Zero or more regex patterns selecting which workqueues to show.
parser.add_argument(
    'workqueue',
    metavar='REGEX',
    nargs='*',
    help='Target workqueue name patterns (all if empty)',
)

# Seconds between samples; 0 means take a single sample and exit.
parser.add_argument(
    '-i', '--interval',
    metavar='SECS',
    type=float,
    default=1,
    help='Monitoring interval (0 to print once and exit)',
)

# Switch from the human-readable table to per-workqueue records.
parser.add_argument(
    '-j', '--json',
    action='store_true',
    help='Output in json',
)

# Parsed once at start-up; main() reads the result from this global.
args = parser.parse_args()
def err(s):
    """Report a fatal error: write `s` to stderr, then exit with status 1."""
    sys.stderr.write(str(s) + '\n')
    # Flush explicitly so the message is not lost when the process exits.
    sys.stderr.flush()
    raise SystemExit(1)
# Kernel symbols and enum values looked up by name through `prog`.
# NOTE(review): `prog` is not defined in this file; presumably it is the
# program object injected by the drgn runtime this script is run under
# (see the shebang) - confirm.

# Looked up under the name 'workqueues'; main() walks it as a list of
# `struct workqueue_struct` entries.
workqueues = prog['workqueues']

# Workqueue flag bits tested against wq.flags in WqStats.__init__().
WQ_UNBOUND = prog['WQ_UNBOUND']
WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM']

# Indices into pool_workqueue.stats[]; PWQ_NR_STATS is the array length used
# when summing the counters.
PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution
PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution
PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME'] # total CPU time consumed
PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups
PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer
PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer
PWQ_NR_STATS = prog['PWQ_NR_STATS']
class WqStats:
    """Snapshot of one workqueue's counters, summed over its pool_workqueues.

    Attributes:
        name:        workqueue name decoded from wq.name.
        unbound:     True when WQ_UNBOUND is set in wq.flags.
        mem_reclaim: True when WQ_MEM_RECLAIM is set in wq.flags.
        stats:       list of PWQ_NR_STATS integers, indexed by PWQ_STAT_*.
    """

    def __init__(self, wq):
        """Read the name, flags and accumulated stats of workqueue `wq`."""
        self.name = wq.name.string_().decode()
        self.unbound = (wq.flags & WQ_UNBOUND) != 0
        self.mem_reclaim = (wq.flags & WQ_MEM_RECLAIM) != 0

        # A workqueue has one pool_workqueue per pool it is attached to; the
        # per-workqueue figure is the element-wise sum over all of them.
        self.stats = [0] * PWQ_NR_STATS
        for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'):
            for i in range(PWQ_NR_STATS):
                self.stats[i] += int(pwq.stats[i])

    def dict(self, now):
        """Return the snapshot as a plain dict, stamped with time `now`."""
        return { 'timestamp'            : now,
                 'name'                 : self.name,
                 'unbound'              : self.unbound,
                 'mem_reclaim'          : self.mem_reclaim,
                 'started'              : self.stats[PWQ_STAT_STARTED],
                 'completed'            : self.stats[PWQ_STAT_COMPLETED],
                 'cpu_time'             : self.stats[PWQ_STAT_CPU_TIME],
                 'cpu_intensive'        : self.stats[PWQ_STAT_CPU_INTENSIVE],
                 'cm_wakeup'            : self.stats[PWQ_STAT_CM_WAKEUP],
                 'mayday'               : self.stats[PWQ_STAT_MAYDAY],
                 'rescued'              : self.stats[PWQ_STAT_RESCUED], }

    # Takes no instance state. Marked static so it can be called on either the
    # class (as main() does) or an instance without raising TypeError.
    @staticmethod
    def table_header_str():
        """Return the column header line matching table_row_str()'s layout."""
        return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
            f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'

    def table_row_str(self):
        """Return this workqueue's stats formatted as one table row.

        Columns that are not filled in for this workqueue are shown as '-':
        CPUitsv/CMwake for unbound workqueues, and mayday/rescued for
        workqueues without WQ_MEM_RECLAIM.
        """
        cpu_intensive = '-'
        cm_wakeup = '-'
        mayday = '-'
        rescued = '-'

        if not self.unbound:
            cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
            cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])

        if self.mem_reclaim:
            mayday = str(self.stats[PWQ_STAT_MAYDAY])
            rescued = str(self.stats[PWQ_STAT_RESCUED])

        # The name column keeps at most the last 24 characters of the name.
        # In-flight is started minus completed, clamped at 0 because the
        # counters are read at slightly different moments.
        # CPU time is scaled by 1/1000000 for display; the help text documents
        # the column as seconds, so the raw counter is presumably in
        # microseconds - TODO confirm against the kernel side.
        out = f'{self.name[-24:]:24} ' \
              f'{self.stats[PWQ_STAT_STARTED]:8} ' \
              f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
              f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
              f'{cpu_intensive:>7} ' \
              f'{cm_wakeup:>7} ' \
              f'{mayday:>7} ' \
              f'{rescued:>7} '
        # The row always ends in a space, so the former rstrip(':') could
        # never strip anything; return the row unchanged.
        return out
# Module-level stop flag: flipped to True by sigint_handler() and checked by
# the `while not exit_req` monitoring loop in main().
exit_req = False

def sigint_handler(signr, frame):
    """Signal handler: request that the monitoring loop stop.

    `signr` and `frame` are the standard signal-handler arguments and are
    not used.
    """
    globals().update(exit_req=True)
def main():
    """Run the monitoring loop.

    Each pass walks the `workqueues` list, builds a WqStats snapshot per
    workqueue, skips those whose name matches none of the command-line
    patterns, and prints either a table row or one JSON object per line
    (with --json). The loop stops after one pass when the interval is 0,
    or once SIGINT has set `exit_req`.
    """
    # handle args
    table_fmt = not args.json
    interval = args.interval

    # A workqueue is shown if ANY pattern matches, so OR the patterns into a
    # single regex. No patterns means no filtering.
    filter_re = re.compile('|'.join(args.workqueue)) if args.workqueue else None

    # monitoring loop
    signal.signal(signal.SIGINT, sigint_handler)

    while not exit_req:
        now = time.time()

        if table_fmt:
            print()
            print(WqStats.table_header_str())

        for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
            stats = WqStats(wq)
            if filter_re and not filter_re.search(stats.name):
                continue
            if table_fmt:
                print(stats.table_row_str())
            else:
                # Serialize with json.dumps: printing the dict directly emits
                # Python repr (single quotes, True/False), which JSON parsers
                # reject.
                print(json.dumps(stats.dict(now)))

        if interval == 0:
            break
        time.sleep(interval)

if __name__ == "__main__":
    main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment