Commit 00c8e791 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] self-unplugging request queues

The patch teaches a queue to unplug itself:

a) if it has four requests OR
b) if it has had plugged requests for 3 milliseconds.
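
In the fast path this is just a request-count check at submission time; condensed from the __make_request() hunk in the diff below (trigger (a) above; the 3 millisecond side is driven by a per-queue timer, sketched a few paragraphs down):

        if (blk_queue_plugged(q)) {
                int nr_queued = (queue_nr_requests - q->rq[0].count) +
                                (queue_nr_requests - q->rq[1].count);

                if (nr_queued == q->unplug_thresh)      /* 4 by default */
                        __generic_unplug_device(q);
        }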

These numbers may need to be tuned, although doing so doesn't seem to
make much difference.  10 msecs works OK, so HZ=100 machines will be
fine.
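
For concreteness, the delay is converted to jiffies and clamped to at least one jiffy; this is the hunk added to blk_queue_make_request() below, with the arithmetic spelled out:

        q->unplug_delay = (3 * HZ) / 1000;      /* HZ=1000: 3 jiffies = 3 msecs */
        if (q->unplug_delay == 0)               /* HZ=100: 300/1000 truncates to 0 */
                q->unplug_delay = 1;            /* so clamp to 1 jiffy = 10 msecs */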

Instrumentation shows that about 5-10% of requests were started due to
the three millisecond timeout (during a kernel compile).  That's
somewhat significant.  It means that the kernel is leaving stuff in the
queue, plugged, for too long.  This testing was with a uniprocessor
preemptible kernel, which is particularly vulnerable to unplug latency
(submit some IO, get preempted before the unplug).

This patch permits the removal of a lot of rather lame unplugging in
page reclaim and in the writeback code, which kicks the queues
(globally!) every four megabytes to get writeback underway.

This patch doesn't use blk_run_queues().  It is able to kick just the
particular queue.
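
The per-queue kick is the new timer -> keventd -> unplug chain: blk_plug_device() arms q->unplug_timer when the queue first plugs, and the rest is (condensed from the hunks below):

        /* timer handler: defer the actual unplug to keventd (process context) */
        static void blk_unplug_timeout(unsigned long data)
        {
                request_queue_t *q = (request_queue_t *)data;
                schedule_work(&q->unplug_work);
        }

        /* work handler: unplug just this one queue */
        static void blk_unplug_work(void *data)
        {
                generic_unplug_device(data);
        }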

The patch is not expected to make much difference really, except for
AIO.  AIO needs a blk_run_queues() in its io_submit() call.  For each
request.  This means that AIO has to disable plugging altogether,
unless something like this patch does it for it.  It means that AIO
will unplug *all* queues in the machine for every io_submit().  Even
against a socket!

This patch was tested by disabling blk_run_queues() completely.  The
system ran OK.

The 3 milliseconds may be too long.  It's OK for the heavy writeback
code, but AIO may want less.  Or maybe AIO really wants zero (ie:
disable plugging).  If that is so, we need new code paths by which AIO
can communicate the "immediate unplug" information - a global unplug is
not good.


To minimise unplug latency due to user CPU load, this patch gives keventd
`nice -10'.  This is of course completely arbitrary.  Really, I think keventd
should be SCHED_RR/MAX_RT_PRIO-1, as it has been in -aa kernels for ages.
parent c5070032
@@ -27,6 +27,8 @@
#include <linux/completion.h>
#include <linux/slab.h>
static void blk_unplug_work(void *data);
/*
 * For the allocated request tables
 */
@@ -237,6 +239,14 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
        blk_queue_hardsect_size(q, 512);
        blk_queue_dma_alignment(q, 511);
        q->unplug_thresh = 4;                   /* hmm */
        q->unplug_delay = (3 * HZ) / 1000;      /* 3 milliseconds */
        if (q->unplug_delay == 0)
                q->unplug_delay = 1;
        init_timer(&q->unplug_timer);
        INIT_WORK(&q->unplug_work, blk_unplug_work, q);
        /*
         * by default assume old behaviour and bounce for any highmem page
         */
@@ -960,6 +970,7 @@ void blk_plug_device(request_queue_t *q)
        if (!blk_queue_plugged(q)) {
                spin_lock(&blk_plug_lock);
                list_add_tail(&q->plug_list, &blk_plug_list);
                mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
                spin_unlock(&blk_plug_lock);
        }
}
@@ -974,6 +985,7 @@ int blk_remove_plug(request_queue_t *q)
        if (blk_queue_plugged(q)) {
                spin_lock(&blk_plug_lock);
                list_del_init(&q->plug_list);
                del_timer(&q->unplug_timer);
                spin_unlock(&blk_plug_lock);
                return 1;
        }
@@ -992,6 +1004,8 @@ static inline void __generic_unplug_device(request_queue_t *q)
        if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
                return;
        del_timer(&q->unplug_timer);
        /*
         * was plugged, fire request_fn if queue has stuff to do
         */
@@ -1020,6 +1034,18 @@ void generic_unplug_device(void *data)
        spin_unlock_irq(q->queue_lock);
}

static void blk_unplug_work(void *data)
{
        generic_unplug_device(data);
}

static void blk_unplug_timeout(unsigned long data)
{
        request_queue_t *q = (request_queue_t *)data;
        schedule_work(&q->unplug_work);
}
/**
 * blk_start_queue - restart a previously stopped queue
 * @q: The &request_queue_t in question
@@ -1164,6 +1190,9 @@ void blk_cleanup_queue(request_queue_t * q)
        count -= __blk_cleanup_queue(&q->rq[READ]);
        count -= __blk_cleanup_queue(&q->rq[WRITE]);
        del_timer_sync(&q->unplug_timer);
        flush_scheduled_work();
        if (count)
                printk("blk_cleanup_queue: leaked requests (%d)\n", count);
@@ -1269,6 +1298,9 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
        blk_queue_make_request(q, __make_request);
        blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
        q->unplug_timer.function = blk_unplug_timeout;
        q->unplug_timer.data = (unsigned long)q;
        blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
        blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
@@ -1811,7 +1843,15 @@ static int __make_request(request_queue_t *q, struct bio *bio)
out:
        if (freereq)
                __blk_put_request(q, freereq);
        if (blk_queue_plugged(q)) {
                int nr_queued = (queue_nr_requests - q->rq[0].count) +
                                (queue_nr_requests - q->rq[1].count);
                if (nr_queued == q->unplug_thresh)
                        __generic_unplug_device(q);
        }
        spin_unlock_irq(q->queue_lock);
        return 0;
end_io:
...
@@ -4,6 +4,8 @@
#include <linux/major.h>
#include <linux/genhd.h>
#include <linux/list.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
@@ -188,6 +190,14 @@ struct request_queue
        unplug_fn *unplug_fn;
        merge_bvec_fn *merge_bvec_fn;
        /*
         * Auto-unplugging state
         */
        struct timer_list unplug_timer;
        int unplug_thresh;              /* After this many requests */
        unsigned long unplug_delay;     /* After this many jiffies */
        struct work_struct unplug_work;
        struct backing_dev_info backing_dev_info;
        /*
...
@@ -177,6 +177,7 @@ static int worker_thread(void *__startup)
        current->flags |= PF_IOTHREAD;
        cwq->thread = current;
        set_user_nice(current, -10);
        set_cpus_allowed(current, 1UL << cpu);
        spin_lock_irq(&current->sig->siglock);
...