Commit b5b7bcfa authored by Nikos Tsironis's avatar Nikos Tsironis Committed by Stefan Bader

dm kcopyd: Fix bug causing workqueue stalls

BugLink: https://bugs.launchpad.net/bugs/1818797

[ Upstream commit d7e6b8df ]

When using kcopyd to run callbacks through dm_kcopyd_do_callback() or
submitting copy jobs with a source size of 0, the jobs are pushed
directly to the complete_jobs list, which could be under processing by
the kcopyd thread. As a result, the kcopyd thread can continue running
completed jobs indefinitely, without releasing the CPU, as long as
someone keeps submitting new completed jobs through the aforementioned
paths. Processing of work items, queued for execution on the same CPU as
the currently running kcopyd thread, is thus stalled for excessive
amounts of time, hurting performance.

Running the following test, from the device mapper test suite [1],

  dmtest run --suite snapshot -n parallel_io_to_many_snaps_N

, with 8 active snapshots, we get, in dmesg, messages like the
following:

[68899.948523] BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 95s!
[68899.949282] Showing busy workqueues and worker pools:
[68899.949288] workqueue events: flags=0x0
[68899.949295]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256
[68899.949306]     pending: vmstat_shepherd, cache_reap
[68899.949331] workqueue mm_percpu_wq: flags=0x8
[68899.949337]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256
[68899.949345]     pending: vmstat_update
[68899.949387] workqueue dm_bufio_cache: flags=0x8
[68899.949392]   pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=1/256
[68899.949400]     pending: work_fn [dm_bufio]
[68899.949423] workqueue kcopyd: flags=0x8
[68899.949429]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256
[68899.949437]     pending: do_work [dm_mod]
[68899.949452] workqueue kcopyd: flags=0x8
[68899.949458]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256
[68899.949466]     in-flight: 13:do_work [dm_mod]
[68899.949474]     pending: do_work [dm_mod]
[68899.949487] workqueue kcopyd: flags=0x8
[68899.949493]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256
[68899.949501]     pending: do_work [dm_mod]
[68899.949515] workqueue kcopyd: flags=0x8
[68899.949521]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256
[68899.949529]     pending: do_work [dm_mod]
[68899.949541] workqueue kcopyd: flags=0x8
[68899.949547]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256
[68899.949555]     pending: do_work [dm_mod]
[68899.949568] pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=95s workers=4 idle: 27130 27223 1084

Fix this by splitting the complete_jobs list into two parts: A user
facing part, named callback_jobs, and one used internally by kcopyd,
retaining the name complete_jobs. dm_kcopyd_do_callback() and
dispatch_job() now push their jobs to the callback_jobs list, which is
spliced to the complete_jobs list once, every time the kcopyd thread
wakes up. This prevents kcopyd from hogging the CPU indefinitely and
causing workqueue stalls.

Re-running the aforementioned test:

  * Workqueue stalls are eliminated
  * The maximum writing time among all targets is reduced from 09m37.10s
    to 06m04.85s and the total run time of the test is reduced from
    10m43.591s to 7m19.199s

[1] https://github.com/jthornber/device-mapper-test-suiteSigned-off-by: default avatarNikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: default avatarIlias Tsitsimpis <iliastsi@arrikto.com>
Signed-off-by: default avatarMike Snitzer <snitzer@redhat.com>
Signed-off-by: default avatarSasha Levin <sashal@kernel.org>
Signed-off-by: default avatarJuerg Haefliger <juergh@canonical.com>
Signed-off-by: default avatarKhalid Elmously <khalid.elmously@canonical.com>
parent 4fdbfb93
...@@ -55,15 +55,17 @@ struct dm_kcopyd_client { ...@@ -55,15 +55,17 @@ struct dm_kcopyd_client {
struct dm_kcopyd_throttle *throttle; struct dm_kcopyd_throttle *throttle;
/* /*
* We maintain three lists of jobs: * We maintain four lists of jobs:
* *
* i) jobs waiting for pages * i) jobs waiting for pages
* ii) jobs that have pages, and are waiting for the io to be issued. * ii) jobs that have pages, and are waiting for the io to be issued.
* iii) jobs that have completed. * iii) jobs that don't need to do any IO and just run a callback
* iv) jobs that have completed.
* *
* All three of these are protected by job_lock. * All four of these are protected by job_lock.
*/ */
spinlock_t job_lock; spinlock_t job_lock;
struct list_head callback_jobs;
struct list_head complete_jobs; struct list_head complete_jobs;
struct list_head io_jobs; struct list_head io_jobs;
struct list_head pages_jobs; struct list_head pages_jobs;
...@@ -583,6 +585,7 @@ static void do_work(struct work_struct *work) ...@@ -583,6 +585,7 @@ static void do_work(struct work_struct *work)
struct dm_kcopyd_client *kc = container_of(work, struct dm_kcopyd_client *kc = container_of(work,
struct dm_kcopyd_client, kcopyd_work); struct dm_kcopyd_client, kcopyd_work);
struct blk_plug plug; struct blk_plug plug;
unsigned long flags;
/* /*
* The order that these are called is *very* important. * The order that these are called is *very* important.
...@@ -591,6 +594,10 @@ static void do_work(struct work_struct *work) ...@@ -591,6 +594,10 @@ static void do_work(struct work_struct *work)
* list. io jobs call wake when they complete and it all * list. io jobs call wake when they complete and it all
* starts again. * starts again.
*/ */
spin_lock_irqsave(&kc->job_lock, flags);
list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
spin_unlock_irqrestore(&kc->job_lock, flags);
blk_start_plug(&plug); blk_start_plug(&plug);
process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->complete_jobs, kc, run_complete_job);
process_jobs(&kc->pages_jobs, kc, run_pages_job); process_jobs(&kc->pages_jobs, kc, run_pages_job);
...@@ -608,7 +615,7 @@ static void dispatch_job(struct kcopyd_job *job) ...@@ -608,7 +615,7 @@ static void dispatch_job(struct kcopyd_job *job)
struct dm_kcopyd_client *kc = job->kc; struct dm_kcopyd_client *kc = job->kc;
atomic_inc(&kc->nr_jobs); atomic_inc(&kc->nr_jobs);
if (unlikely(!job->source.count)) if (unlikely(!job->source.count))
push(&kc->complete_jobs, job); push(&kc->callback_jobs, job);
else if (job->pages == &zero_page_list) else if (job->pages == &zero_page_list)
push(&kc->io_jobs, job); push(&kc->io_jobs, job);
else else
...@@ -795,7 +802,7 @@ void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) ...@@ -795,7 +802,7 @@ void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
job->read_err = read_err; job->read_err = read_err;
job->write_err = write_err; job->write_err = write_err;
push(&kc->complete_jobs, job); push(&kc->callback_jobs, job);
wake(kc); wake(kc);
} }
EXPORT_SYMBOL(dm_kcopyd_do_callback); EXPORT_SYMBOL(dm_kcopyd_do_callback);
...@@ -825,6 +832,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro ...@@ -825,6 +832,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
spin_lock_init(&kc->job_lock); spin_lock_init(&kc->job_lock);
INIT_LIST_HEAD(&kc->callback_jobs);
INIT_LIST_HEAD(&kc->complete_jobs); INIT_LIST_HEAD(&kc->complete_jobs);
INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->io_jobs);
INIT_LIST_HEAD(&kc->pages_jobs); INIT_LIST_HEAD(&kc->pages_jobs);
...@@ -874,6 +882,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) ...@@ -874,6 +882,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
/* Wait for completion of all jobs submitted by this client. */ /* Wait for completion of all jobs submitted by this client. */
wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
BUG_ON(!list_empty(&kc->callback_jobs));
BUG_ON(!list_empty(&kc->complete_jobs)); BUG_ON(!list_empty(&kc->complete_jobs));
BUG_ON(!list_empty(&kc->io_jobs)); BUG_ON(!list_empty(&kc->io_jobs));
BUG_ON(!list_empty(&kc->pages_jobs)); BUG_ON(!list_empty(&kc->pages_jobs));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment