Commit e5e9a520 authored by Chris Mason

Btrfs: avoid races between super writeout and device list updates

On multi-device filesystems, btrfs writes supers to all of the devices
before considering a sync complete.  There wasn't any additional
locking between super writeout and the device list management code
because device management was done inside a transaction and
super writeout only happened with no transaction writers running.

With the btrfs fsync log and other async transaction updates, this
has been racy for some time.  This adds a mutex to protect
the device list.  The existing volume mutex could not be reused due to
transaction lock ordering requirements.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 7df336ec
...@@ -2111,7 +2111,7 @@ static int write_dev_supers(struct btrfs_device *device, ...@@ -2111,7 +2111,7 @@ static int write_dev_supers(struct btrfs_device *device,
int write_all_supers(struct btrfs_root *root, int max_mirrors) int write_all_supers(struct btrfs_root *root, int max_mirrors)
{ {
struct list_head *head = &root->fs_info->fs_devices->devices; struct list_head *head;
struct btrfs_device *dev; struct btrfs_device *dev;
struct btrfs_super_block *sb; struct btrfs_super_block *sb;
struct btrfs_dev_item *dev_item; struct btrfs_dev_item *dev_item;
...@@ -2126,6 +2126,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) ...@@ -2126,6 +2126,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
sb = &root->fs_info->super_for_commit; sb = &root->fs_info->super_for_commit;
dev_item = &sb->dev_item; dev_item = &sb->dev_item;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
list_for_each_entry(dev, head, dev_list) { list_for_each_entry(dev, head, dev_list) {
if (!dev->bdev) { if (!dev->bdev) {
total_errors++; total_errors++;
...@@ -2169,6 +2172,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) ...@@ -2169,6 +2172,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (ret) if (ret)
total_errors++; total_errors++;
} }
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) { if (total_errors > max_errors) {
printk(KERN_ERR "btrfs: %d errors while writing supers\n", printk(KERN_ERR "btrfs: %d errors while writing supers\n",
total_errors); total_errors);
......
...@@ -377,6 +377,7 @@ static noinline int device_list_add(const char *path, ...@@ -377,6 +377,7 @@ static noinline int device_list_add(const char *path,
memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->latest_devid = devid; fs_devices->latest_devid = devid;
fs_devices->latest_trans = found_transid; fs_devices->latest_trans = found_transid;
mutex_init(&fs_devices->device_list_mutex);
device = NULL; device = NULL;
} else { } else {
device = __find_device(&fs_devices->devices, devid, device = __find_device(&fs_devices->devices, devid,
...@@ -403,7 +404,11 @@ static noinline int device_list_add(const char *path, ...@@ -403,7 +404,11 @@ static noinline int device_list_add(const char *path,
return -ENOMEM; return -ENOMEM;
} }
INIT_LIST_HEAD(&device->dev_alloc_list); INIT_LIST_HEAD(&device->dev_alloc_list);
mutex_lock(&fs_devices->device_list_mutex);
list_add(&device->dev_list, &fs_devices->devices); list_add(&device->dev_list, &fs_devices->devices);
mutex_unlock(&fs_devices->device_list_mutex);
device->fs_devices = fs_devices; device->fs_devices = fs_devices;
fs_devices->num_devices++; fs_devices->num_devices++;
} }
...@@ -429,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) ...@@ -429,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
INIT_LIST_HEAD(&fs_devices->devices); INIT_LIST_HEAD(&fs_devices->devices);
INIT_LIST_HEAD(&fs_devices->alloc_list); INIT_LIST_HEAD(&fs_devices->alloc_list);
INIT_LIST_HEAD(&fs_devices->list); INIT_LIST_HEAD(&fs_devices->list);
mutex_init(&fs_devices->device_list_mutex);
fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_devid = orig->latest_devid;
fs_devices->latest_trans = orig->latest_trans; fs_devices->latest_trans = orig->latest_trans;
memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
mutex_lock(&orig->device_list_mutex);
list_for_each_entry(orig_dev, &orig->devices, dev_list) { list_for_each_entry(orig_dev, &orig->devices, dev_list) {
device = kzalloc(sizeof(*device), GFP_NOFS); device = kzalloc(sizeof(*device), GFP_NOFS);
if (!device) if (!device)
...@@ -454,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) ...@@ -454,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device->fs_devices = fs_devices; device->fs_devices = fs_devices;
fs_devices->num_devices++; fs_devices->num_devices++;
} }
mutex_unlock(&orig->device_list_mutex);
return fs_devices; return fs_devices;
error: error:
mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices); free_fs_devices(fs_devices);
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
...@@ -466,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) ...@@ -466,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&uuid_mutex); mutex_lock(&uuid_mutex);
again: again:
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->in_fs_metadata) if (device->in_fs_metadata)
continue; continue;
...@@ -485,6 +495,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) ...@@ -485,6 +495,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
kfree(device->name); kfree(device->name);
kfree(device); kfree(device);
} }
mutex_unlock(&fs_devices->device_list_mutex);
if (fs_devices->seed) { if (fs_devices->seed) {
fs_devices = fs_devices->seed; fs_devices = fs_devices->seed;
...@@ -1135,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ...@@ -1135,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
device = NULL; device = NULL;
devices = &root->fs_info->fs_devices->devices; devices = &root->fs_info->fs_devices->devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_for_each_entry(tmp, devices, dev_list) { list_for_each_entry(tmp, devices, dev_list) {
if (tmp->in_fs_metadata && !tmp->bdev) { if (tmp->in_fs_metadata && !tmp->bdev) {
device = tmp; device = tmp;
break; break;
} }
} }
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
bdev = NULL; bdev = NULL;
bh = NULL; bh = NULL;
disk_super = NULL; disk_super = NULL;
...@@ -1195,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ...@@ -1195,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
goto error_brelse; goto error_brelse;
device->in_fs_metadata = 0; device->in_fs_metadata = 0;
/*
* the device list mutex makes sure that we don't change
* the device list while someone else is writing out all
* the device supers.
*/
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_del_init(&device->dev_list); list_del_init(&device->dev_list);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
device->fs_devices->num_devices--; device->fs_devices->num_devices--;
next_device = list_entry(root->fs_info->fs_devices->devices.next, next_device = list_entry(root->fs_info->fs_devices->devices.next,
...@@ -1289,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, ...@@ -1289,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
seed_devices->opened = 1; seed_devices->opened = 1;
INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->devices);
INIT_LIST_HEAD(&seed_devices->alloc_list); INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
list_splice_init(&fs_devices->devices, &seed_devices->devices); list_splice_init(&fs_devices->devices, &seed_devices->devices);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
list_for_each_entry(device, &seed_devices->devices, dev_list) { list_for_each_entry(device, &seed_devices->devices, dev_list) {
...@@ -1414,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ...@@ -1414,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
mutex_lock(&root->fs_info->volume_mutex); mutex_lock(&root->fs_info->volume_mutex);
devices = &root->fs_info->fs_devices->devices; devices = &root->fs_info->fs_devices->devices;
/*
* we have the volume lock, so we don't need the extra
* device list mutex while reading the list here.
*/
list_for_each_entry(device, devices, dev_list) { list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) { if (device->bdev == bdev) {
ret = -EEXIST; ret = -EEXIST;
...@@ -1468,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ...@@ -1468,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
} }
device->fs_devices = root->fs_info->fs_devices; device->fs_devices = root->fs_info->fs_devices;
/*
* we don't want write_supers to jump in here with our device
* half setup
*/
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_add(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list, list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list); &root->fs_info->fs_devices->alloc_list);
...@@ -1486,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ...@@ -1486,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
btrfs_set_super_num_devices(&root->fs_info->super_copy, btrfs_set_super_num_devices(&root->fs_info->super_copy,
total_bytes + 1); total_bytes + 1);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) { if (seeding_dev) {
ret = init_first_rw_device(trans, root, device); ret = init_first_rw_device(trans, root, device);
......
...@@ -96,7 +96,12 @@ struct btrfs_fs_devices { ...@@ -96,7 +96,12 @@ struct btrfs_fs_devices {
u64 rw_devices; u64 rw_devices;
u64 total_rw_bytes; u64 total_rw_bytes;
struct block_device *latest_bdev; struct block_device *latest_bdev;
/* all of the devices in the FS */
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
* worrying about add/remove by the multi-device code
*/
struct mutex device_list_mutex;
struct list_head devices; struct list_head devices;
/* devices not currently being allocated */ /* devices not currently being allocated */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment