Commit 8b712842 authored by Chris Mason

Btrfs: Add async worker threads for pre and post IO checksumming

Btrfs has been using workqueues to spread the checksumming load across
other CPUs in the system.  But, workqueues only schedule work on the
same CPU that queued the work, giving them a limited benefit for systems with
higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads,
and changes the bio submission code to queue bios up.  The queueing is
important to make sure large numbers of procs on the system don't
turn streaming workloads into random workloads by sending IO down
concurrently.

The end result of all of this is much higher performance (and CPU usage) when
doing checksumming on large machines.  Two worker pools are created,
one for writes and one for endio processing.  The two could deadlock if
we tried to service both from a single pool.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 43e570b0
......@@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
hash.o file-item.o inode-item.o inode-map.o disk-io.o \
transaction.o bit-radix.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o
extent_io.o volumes.o async-thread.o
btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o
else
......
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
#include "async-thread.h"
/*
 * container for the kthread task pointer and the list of pending work
 * One of these is allocated per thread (see btrfs_start_workers) and
 * freed by btrfs_stop_workers.
 */
struct btrfs_worker_thread {
/*
 * list of struct btrfs_work that are waiting for service.  Items are
 * added at the tail by btrfs_queue_worker/btrfs_requeue_work and taken
 * from the head by worker_loop.
 */
struct list_head pending;
/* linkage into struct btrfs_workers->worker_list */
struct list_head worker_list;
/* kthread running worker_loop for this structure */
struct task_struct *task;
/*
 * number of things on the pending list.  It is only decremented after
 * the work function returns, so it also counts the item being run.
 */
atomic_t num_pending;
/* protects the pending list; 'working' is also updated under it */
spinlock_t lock;
/*
 * set to non-zero when this thread is already awake and kicking.
 * btrfs_queue_worker sets it and only calls wake_up_process when it
 * was zero; worker_loop clears it once the pending list is drained.
 */
int working;
};
/*
 * main loop for servicing work items
 *
 * This is the thread function of every worker kthread; arg is the
 * struct btrfs_worker_thread the thread belongs to.  Each pass drains
 * worker->pending, calling work->func with the lock dropped, and then
 * sleeps until btrfs_queue_worker wakes the thread or kthread_stop is
 * called on it.
 *
 * Always returns 0.
 */
static int worker_loop(void *arg)
{
	struct btrfs_worker_thread *worker = arg;
	struct list_head *cur;
	struct btrfs_work *work;

	do {
		spin_lock_irq(&worker->lock);
		while (!list_empty(&worker->pending)) {
			cur = worker->pending.next;
			work = list_entry(cur, struct btrfs_work, list);
			list_del(&work->list);
			/*
			 * bit 0 of flags means "already queued"; clear it
			 * before running so func may requeue the item
			 */
			clear_bit(0, &work->flags);
			/* remembered for btrfs_requeue_work */
			work->worker = worker;
			spin_unlock_irq(&worker->lock);

			work->func(work);

			/* work must not be touched past this point */
			atomic_dec(&worker->num_pending);
			spin_lock_irq(&worker->lock);
		}
		worker->working = 0;
		if (freezing(current)) {
			/*
			 * refrigerator() sleeps, so the spinlock has to be
			 * released (and irqs re-enabled) first.  The top of
			 * the loop takes the lock again.
			 */
			spin_unlock_irq(&worker->lock);
			refrigerator();
		} else {
			/*
			 * the task state is changed while the lock is still
			 * held so a concurrent btrfs_queue_worker either
			 * sees working == 0 and wakes us, or its item was
			 * already picked up above
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&worker->lock);
			/*
			 * re-check the stop request after changing state,
			 * otherwise a kthread_stop that ran just before
			 * set_current_state would leave us asleep forever
			 */
			if (!kthread_should_stop())
				schedule();
			__set_current_state(TASK_RUNNING);
		}
	} while (!kthread_should_stop());
	return 0;
}
/*
* this will wait for all the worker threads to shutdown
*/
int btrfs_stop_workers(struct btrfs_workers *workers)
{
struct list_head *cur;
struct btrfs_worker_thread *worker;
while(!list_empty(&workers->worker_list)) {
cur = workers->worker_list.next;
worker = list_entry(cur, struct btrfs_worker_thread,
worker_list);
kthread_stop(worker->task);
list_del(&worker->worker_list);
kfree(worker);
}
return 0;
}
/*
 * Put a struct btrfs_workers into its empty starting state: no threads,
 * no cached "last used" thread, and a thread limit of 'max'.  No kthread
 * is created here; that is done by btrfs_start_workers.
 */
void btrfs_init_workers(struct btrfs_workers *workers, int max)
{
	spin_lock_init(&workers->lock);
	INIT_LIST_HEAD(&workers->worker_list);

	workers->last = NULL;
	workers->max_workers = max;
	workers->num_workers = 0;
}
/*
 * starts new worker threads.  This does not enforce the max worker
 * count in case you need to temporarily go past it.
 *
 * workers:     pool to add the threads to
 * num_workers: how many threads to create
 *
 * Each new thread runs worker_loop, is appended to workers->worker_list
 * and becomes workers->last.
 *
 * Returns 0 on success.  On failure returns -ENOMEM or the error from
 * kthread_run, after stopping every thread in the pool (including ones
 * that existed before this call) via btrfs_stop_workers.
 */
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
{
	struct btrfs_worker_thread *worker;
	int ret = 0;
	int i;

	for (i = 0; i < num_workers; i++) {
		worker = kzalloc(sizeof(*worker), GFP_NOFS);
		if (!worker) {
			ret = -ENOMEM;
			goto fail;
		}

		/* fully set up before kthread_run: the thread starts at once */
		INIT_LIST_HEAD(&worker->pending);
		INIT_LIST_HEAD(&worker->worker_list);
		spin_lock_init(&worker->lock);
		atomic_set(&worker->num_pending, 0);

		worker->task = kthread_run(worker_loop, worker, "btrfs");
		if (IS_ERR(worker->task)) {
			ret = PTR_ERR(worker->task);
			/*
			 * this worker is not on workers->worker_list yet, so
			 * btrfs_stop_workers below would never free it
			 */
			kfree(worker);
			goto fail;
		}

		spin_lock_irq(&workers->lock);
		list_add_tail(&worker->worker_list, &workers->worker_list);
		workers->last = worker;
		workers->num_workers++;
		spin_unlock_irq(&workers->lock);
	}
	return 0;
fail:
	btrfs_stop_workers(workers);
	return ret;
}
/*
 * run through the list and find a worker thread that doesn't have a lot
 * to do right now.  A thread counts as busy once it has 64 or more
 * items pending.
 *
 * This can return NULL if no thread has been started yet, or if we
 * aren't yet at the thread count limit and all of the threads are busy.
 * The caller (find_worker) starts another thread in that case.
 *
 * The thread that is picked is remembered in workers->last.  find_worker
 * calls this with workers->lock held.
 */
static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
{
	struct btrfs_worker_thread *worker;
	struct list_head *next;
	struct list_head *start;
	/* non-zero while more threads may still be created */
	int enforce_min = workers->num_workers < workers->max_workers;

	/*
	 * workers->last is NULL between btrfs_init_workers and the first
	 * successful btrfs_start_workers; there is nothing to choose from
	 */
	worker = workers->last;
	if (!worker)
		return NULL;

	/* start with the last thread if it isn't busy */
	if (atomic_read(&worker->num_pending) < 64)
		goto done;

	next = worker->worker_list.next;
	start = &worker->worker_list;
	/*
	 * check all the workers for someone that is bored.  FIXME, do
	 * something smart here
	 */
	while (next != start) {
		/* the list head is not a worker, step over it */
		if (next == &workers->worker_list) {
			next = workers->worker_list.next;
			continue;
		}
		worker = list_entry(next, struct btrfs_worker_thread,
				    worker_list);
		/*
		 * at the thread limit (!enforce_min) simply take the next
		 * thread in the list, busy or not
		 */
		if (atomic_read(&worker->num_pending) < 64 || !enforce_min)
			goto done;
		next = next->next;
	}
	/*
	 * nobody was bored, if we're already at the max thread count,
	 * use the last thread
	 */
	if (!enforce_min || atomic_read(&workers->last->num_pending) < 64)
		return workers->last;
	return NULL;
done:
	workers->last = worker;
	return worker;
}
/*
 * Pick the thread a new work item should be queued on.
 *
 * next_worker is asked first.  When it has no candidate and the pool is
 * already at max_workers, the first thread on the list is used.  When
 * the pool is below the limit one more thread is started and the search
 * is repeated.
 *
 * NOTE(review): the return value of btrfs_start_workers is not checked
 * here, so a failed thread creation is followed by another search.
 */
static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
{
	struct btrfs_worker_thread *picked;
	unsigned long irqflags;
	int at_limit;

	for (;;) {
		spin_lock_irqsave(&workers->lock, irqflags);
		picked = next_worker(workers);
		spin_unlock_irqrestore(&workers->lock, irqflags);
		if (picked)
			return picked;

		spin_lock_irqsave(&workers->lock, irqflags);
		at_limit = workers->num_workers >= workers->max_workers;
		if (at_limit) {
			/* no thread may be added: fall back to the list head */
			picked = list_entry(workers->worker_list.next,
					    struct btrfs_worker_thread,
					    worker_list);
		}
		spin_unlock_irqrestore(&workers->lock, irqflags);
		if (at_limit)
			return picked;

		/* still room in the pool: add one thread and look again */
		btrfs_start_workers(workers, 1);
	}
}
/*
 * Put a work item back at the tail of the pending list of the thread
 * recorded in work->worker (set by worker_loop when the item was picked
 * up).  Meant for long running work functions that have made some
 * progress and want to let other items run before continuing.
 *
 * Nothing is done when bit 0 of work->flags says the item is already
 * queued.  No wakeup is issued.  Always returns 0.
 */
int btrfs_requeue_work(struct btrfs_work *work)
{
	struct btrfs_worker_thread *owner = work->worker;
	unsigned long irqflags;

	if (!test_and_set_bit(0, &work->flags)) {
		spin_lock_irqsave(&owner->lock, irqflags);
		atomic_inc(&owner->num_pending);
		list_add_tail(&work->list, &owner->pending);
		spin_unlock_irqrestore(&owner->lock, irqflags);
	}
	return 0;
}
/*
 * Hand a struct btrfs_work to one of the kthreads of 'workers'.
 *
 * An item whose bit 0 in work->flags is already set is on a list and is
 * left alone.  Otherwise a thread is chosen with find_worker, the item
 * goes to the tail of that thread's pending list and the thread is woken
 * unless it was already marked as working.  Always returns 0.
 */
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
{
	struct btrfs_worker_thread *target;
	unsigned long irqflags;
	int need_wake;

	/* don't requeue something already on a list */
	if (test_and_set_bit(0, &work->flags))
		return 0;

	target = find_worker(workers);

	spin_lock_irqsave(&target->lock, irqflags);
	atomic_inc(&target->num_pending);
	list_add_tail(&work->list, &target->pending);
	/* a thread that is already kicked needs no wake_up_process */
	need_wake = !target->working;
	target->working = 1;
	spin_unlock_irqrestore(&target->lock, irqflags);

	if (need_wake)
		wake_up_process(target->task);
	return 0;
}
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef __BTRFS_ASYNC_THREAD_
#define __BTRFS_ASYNC_THREAD_
struct btrfs_worker_thread;
/*
* This is similar to a workqueue, but it is meant to spread the operations
* across all available cpus instead of just the CPU that was used to
* queue the work. There is also some batching introduced to try and
* cut down on context switches.
*
* By default threads are added on demand up to 2 * the number of cpus.
* Changing struct btrfs_workers->max_workers is one way to prevent
* demand creation of kthreads.
*
* the basic model of these worker threads is to embed a btrfs_work
* structure in your own data struct, and use container_of in a
* work function to get back to your data struct.
*/
/*
 * one unit of work for a btrfs worker thread.  It is queued with
 * btrfs_queue_worker and may be put back by its own work function with
 * btrfs_requeue_work.
 */
struct btrfs_work {
/*
 * only func should be set to the function you want called
 * your work struct is passed as the only arg
 */
void (*func)(struct btrfs_work *work);
/*
 * flags should be set to zero. It is used to make sure the
 * struct is only inserted once into the list: bit 0 is set when the
 * work is queued and cleared by the worker thread just before func
 * is called.
 */
unsigned long flags;
/* don't touch these */
/* thread that last picked this work up; used by btrfs_requeue_work */
struct btrfs_worker_thread *worker;
/* linkage on the worker thread's pending list */
struct list_head list;
};
/*
 * a pool of worker threads.  Set up with btrfs_init_workers, populated
 * by btrfs_start_workers and torn down by btrfs_stop_workers.
 */
struct btrfs_workers {
/* current number of running workers; bumped by btrfs_start_workers */
int num_workers;
/*
 * max number of workers allowed.  Set by btrfs_init_workers; it limits
 * on-demand thread creation when work is queued, but
 * btrfs_start_workers itself does not enforce it.
 */
int max_workers;
/* list with all the work threads */
struct list_head worker_list;
/* the last worker thread to have something queued */
struct btrfs_worker_thread *last;
/* lock for finding the next worker thread to queue on */
spinlock_t lock;
};
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
int btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, int max);
int btrfs_requeue_work(struct btrfs_work *work);
#endif
......@@ -30,6 +30,7 @@
#include "bit-radix.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
......@@ -518,13 +519,20 @@ struct btrfs_fs_info {
struct list_head hashers;
struct list_head dead_roots;
struct list_head end_io_work_list;
struct list_head async_submit_work_list;
struct work_struct end_io_work;
struct work_struct async_submit_work;
spinlock_t end_io_work_lock;
spinlock_t async_submit_work_lock;
atomic_t nr_async_submits;
/*
* there is a pool of worker threads for checksumming during writes
* and a pool for checksumming after reads. This is because readers
* can run with FS locks held, and the writers may be waiting for
* those locks. We don't want ordering in the pending list to cause
* deadlocks, and so the two are serviced separately.
*/
struct btrfs_workers workers;
struct btrfs_workers endio_workers;
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
struct work_struct trans_work;
#else
......
This diff is collapsed.
......@@ -359,7 +359,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
kfree(sums);
return btrfs_map_bio(root, rw, bio, mirror_num);
return btrfs_map_bio(root, rw, bio, mirror_num, 1);
}
int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
......@@ -383,7 +383,7 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
inode, rw, bio, mirror_num,
__btrfs_submit_bio_hook);
mapit:
return btrfs_map_bio(root, rw, bio, mirror_num);
return btrfs_map_bio(root, rw, bio, mirror_num, 0);
}
int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
......
......@@ -27,6 +27,7 @@
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
struct map_lookup {
u64 type;
......@@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
return NULL;
}
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device. This greatly
 * improves the schedulers ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block. The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested. This way, multiple devices
 * can make progress from a single worker thread.
 *
 * Called from pending_bios_fn with the device whose work struct was
 * queued.  Always returns 0.
 */
int run_scheduled_bios(struct btrfs_device *device)
{
struct bio *pending;
struct backing_dev_info *bdi;
struct bio *tail;
struct bio *cur;
int again = 0;
unsigned long num_run = 0;
/* backing_dev_info of the block device, used for the congestion test */
bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
loop:
spin_lock(&device->io_lock);
/* take all the bios off the list at once and process them
 * later on (without the lock held). But, remember the
 * tail and other pointers so the bios can be properly reinserted
 * into the list if we hit congestion
 */
pending = device->pending_bios;
tail = device->pending_bio_tail;
WARN_ON(pending && !tail);
device->pending_bios = NULL;
device->pending_bio_tail = NULL;
/*
 * if pending was null this time around, no bios need processing
 * at all and we can stop. Otherwise it'll loop back up again
 * and do an additional check so no bios are missed.
 *
 * device->running_pending is used to synchronize with the
 * schedule_bio code: while it is set, schedule_bio only appends to
 * the list and does not queue the work struct again.
 */
if (pending) {
again = 1;
device->running_pending = 1;
} else {
again = 0;
device->running_pending = 0;
}
spin_unlock(&device->io_lock);
while(pending) {
/* detach the head bio from the private chain before submitting it */
cur = pending;
pending = pending->bi_next;
cur->bi_next = NULL;
/* pairs with the atomic_inc done in schedule_bio */
atomic_dec(&device->dev_root->fs_info->nr_async_submits);
submit_bio(cur->bi_rw, cur);
num_run++;
/*
 * we made progress, there is more work to do and the bdi
 * is now congested. Back off and let other work structs
 * run instead
 */
if (pending && num_run && bdi_write_congested(bdi)) {
struct bio *old_head;
spin_lock(&device->io_lock);
/*
 * put the unsubmitted remainder back at the front of the
 * device list.  Bios that schedule_bio added in the
 * meantime (old_head) are chained after our tail and
 * the device keeps its newer tail pointer; if none were
 * added our tail becomes the device tail again.
 */
old_head = device->pending_bios;
device->pending_bios = pending;
if (device->pending_bio_tail)
tail->bi_next = old_head;
else
device->pending_bio_tail = tail;
spin_unlock(&device->io_lock);
/*
 * running_pending is left set on this path; the requeue
 * below is what gets the remaining bios serviced
 */
btrfs_requeue_work(&device->work);
goto done;
}
}
/* something was processed: check once more for newly added bios */
if (again)
goto loop;
done:
return 0;
}
/*
 * btrfs_work callback installed in device->work.func: map the work
 * struct back to the btrfs_device that embeds it and submit the bios
 * collected for that device.
 */
void pending_bios_fn(struct btrfs_work *work)
{
	run_scheduled_bios(container_of(work, struct btrfs_device, work));
}
static int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
......@@ -141,6 +237,7 @@ static int device_list_add(const char *path,
return -ENOMEM;
}
device->devid = devid;
device->work.func = pending_bios_fn;
memcpy(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE);
device->barriers = 1;
......@@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
}
device->barriers = 1;
device->work.func = pending_bios_fn;
generate_random_uuid(device->uuid);
spin_lock_init(&device->io_lock);
device->name = kstrdup(device_path, GFP_NOFS);
......@@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio,
#endif
}
/*
 * bundles a bio with its rw flags, the owning fs_info and an embedded
 * btrfs_work.
 * NOTE(review): nothing in the code visible here references this
 * struct; confirm whether it is still needed.
 */
struct async_sched {
struct bio *bio;
int rw;
struct btrfs_fs_info *info;
struct btrfs_work work;
};
/*
 * Queue one bio for asynchronous submission to 'device'; see
 * run_scheduled_bios for why bios are collected per device.
 *
 * Bios without the BIO_RW bit in 'rw' (reads) are submitted directly.
 * Anything else is appended to the device's pending list, and the
 * device work struct is queued on fs_info->workers unless
 * run_scheduled_bios is already active for this device.
 *
 * Always returns 0.
 */
int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
		 int rw, struct bio *bio)
{
	int worker_active;

	/* don't bother with additional async steps for reads, right now */
	if (!(rw & (1 << BIO_RW))) {
		submit_bio(rw, bio);
		return 0;
	}

	/*
	 * nr_async_submits allows us to reliably return congestion to the
	 * higher layers.  Otherwise, the async bio makes it appear we have
	 * made progress against dirty pages when we've really just put it
	 * on a queue for later
	 */
	atomic_inc(&root->fs_info->nr_async_submits);
	bio->bi_next = NULL;
	bio->bi_rw |= rw;

	spin_lock(&device->io_lock);
	/* append at the tail; the bio also becomes the head of an empty list */
	if (device->pending_bio_tail)
		device->pending_bio_tail->bi_next = bio;
	device->pending_bio_tail = bio;
	if (!device->pending_bios)
		device->pending_bios = bio;
	worker_active = device->running_pending;
	spin_unlock(&device->io_lock);

	if (!worker_active)
		btrfs_queue_worker(&root->fs_info->workers, &device->work);
	return 0;
}
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
int mirror_num)
int mirror_num, int async_submit)
{
struct btrfs_mapping_tree *map_tree;
struct btrfs_device *dev;
......@@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
dev = multi->stripes[dev_nr].dev;
if (dev && dev->bdev) {
bio->bi_bdev = dev->bdev;
spin_lock(&dev->io_lock);
dev->total_ios++;
spin_unlock(&dev->io_lock);
submit_bio(rw, bio);
if (async_submit)
schedule_bio(root, dev, rw, bio);
else
submit_bio(rw, bio);
} else {
bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
bio->bi_sector = logical >> 9;
......@@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
device->barriers = 1;
device->dev_root = root->fs_info->dev_root;
device->devid = devid;
device->work.func = pending_bios_fn;
fs_devices->num_devices++;
spin_lock_init(&device->io_lock);
memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
......
......@@ -20,6 +20,7 @@
#define __BTRFS_VOLUMES_
#include <linux/bio.h>
#include "async-thread.h"
struct buffer_head;
struct btrfs_device {
......@@ -27,6 +28,9 @@ struct btrfs_device {
struct list_head dev_alloc_list;
struct btrfs_root *dev_root;
struct buffer_head *pending_io;
struct bio *pending_bios;
struct bio *pending_bio_tail;
int running_pending;
u64 generation;
int barriers;
......@@ -36,8 +40,6 @@ struct btrfs_device {
struct block_device *bdev;
u64 total_ios;
char *name;
/* the internal btrfs device id */
......@@ -63,6 +65,8 @@ struct btrfs_device {
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
struct btrfs_work work;
};
struct btrfs_fs_devices {
......@@ -117,7 +121,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
int mirror_num);
int mirror_num, int async_submit);
int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int flags, void *holder);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment