Commit 5af3e8cc authored by Stefan Behrens, committed by Chris Mason

Btrfs: make filesystem read-only when submitting barrier fails

So far the return code of barrier_all_devices() is ignored, which
means that errors are ignored. The result can be a corrupt
filesystem which is not consistent.
This commit adds code to evaluate the return code of
barrier_all_devices(). The normal btrfs_error() mechanism is used to
switch the filesystem into read-only mode when errors are detected.

In order to decide whether barrier_all_devices() should return
error or success, the number of disks that are allowed to fail the
barrier submission is calculated. This calculation accounts for the
worst RAID level of metadata, system and data. If single, dup or
RAID0 is in use, a single disk error is already considered to be
fatal. Otherwise a single disk error is tolerated.

The calculation of the number of disks that are tolerated to fail
the barrier operation is performed when the filesystem gets mounted,
when a balance operation is started and finished, and when devices
are added or removed.
Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
parent 62856a9b
...@@ -1468,6 +1468,8 @@ struct btrfs_fs_info { ...@@ -1468,6 +1468,8 @@ struct btrfs_fs_info {
/* next backup root to be overwritten */ /* next backup root to be overwritten */
int backup_root_index; int backup_root_index;
int num_tolerated_disk_barrier_failures;
}; };
/* /*
...@@ -3361,6 +3363,9 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); ...@@ -3361,6 +3363,9 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
int btrfs_defrag_file(struct inode *inode, struct file *file, int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range, struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages); u64 newer_than, unsigned long max_pages);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
/* file.c */ /* file.c */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct inode *inode); struct inode *inode);
......
...@@ -2505,6 +2505,8 @@ int open_ctree(struct super_block *sb, ...@@ -2505,6 +2505,8 @@ int open_ctree(struct super_block *sb,
printk(KERN_ERR "Failed to read block groups: %d\n", ret); printk(KERN_ERR "Failed to read block groups: %d\n", ret);
goto fail_block_groups; goto fail_block_groups;
} }
fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner"); "btrfs-cleaner");
...@@ -2888,12 +2890,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait) ...@@ -2888,12 +2890,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
printk_in_rcu("btrfs: disabling barriers on dev %s\n", printk_in_rcu("btrfs: disabling barriers on dev %s\n",
rcu_str_deref(device->name)); rcu_str_deref(device->name));
device->nobarriers = 1; device->nobarriers = 1;
} } else if (!bio_flagged(bio, BIO_UPTODATE)) {
if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO; ret = -EIO;
if (!bio_flagged(bio, BIO_EOPNOTSUPP)) btrfs_dev_stat_inc_and_print(device,
btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
BTRFS_DEV_STAT_FLUSH_ERRS);
} }
/* drop the reference from the wait == 0 run */ /* drop the reference from the wait == 0 run */
...@@ -2932,14 +2932,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ...@@ -2932,14 +2932,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
{ {
struct list_head *head; struct list_head *head;
struct btrfs_device *dev; struct btrfs_device *dev;
int errors = 0; int errors_send = 0;
int errors_wait = 0;
int ret; int ret;
/* send down all the barriers */ /* send down all the barriers */
head = &info->fs_devices->devices; head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) { list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) { if (!dev->bdev) {
errors++; errors_send++;
continue; continue;
} }
if (!dev->in_fs_metadata || !dev->writeable) if (!dev->in_fs_metadata || !dev->writeable)
...@@ -2947,13 +2948,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ...@@ -2947,13 +2948,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
ret = write_dev_flush(dev, 0); ret = write_dev_flush(dev, 0);
if (ret) if (ret)
errors++; errors_send++;
} }
/* wait for all the barriers */ /* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) { list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) { if (!dev->bdev) {
errors++; errors_wait++;
continue; continue;
} }
if (!dev->in_fs_metadata || !dev->writeable) if (!dev->in_fs_metadata || !dev->writeable)
...@@ -2961,13 +2962,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info) ...@@ -2961,13 +2962,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
ret = write_dev_flush(dev, 1); ret = write_dev_flush(dev, 1);
if (ret) if (ret)
errors++; errors_wait++;
} }
if (errors) if (errors_send > info->num_tolerated_disk_barrier_failures ||
errors_wait > info->num_tolerated_disk_barrier_failures)
return -EIO; return -EIO;
return 0; return 0;
} }
/*
 * Work out how many devices may fail a barrier (flush) submission before
 * the failure has to be treated as fatal.
 *
 * The result starts out as the number of devices in the filesystem and is
 * then lowered according to the RAID profiles found in the data, system,
 * metadata and mixed data+metadata space infos:
 *
 *   0  as soon as any considered block group list is single (no profile
 *      bit set), DUP or RAID0;
 *   1  otherwise, as soon as any considered list is RAID1 or RAID10.
 *
 * Only these two caps are implemented here; if neither applies, the device
 * count is returned unchanged.  A list is skipped when the summary filled
 * in by btrfs_get_block_group_info() reports zero total bytes or zero used
 * bytes.
 *
 * @fs_info: filesystem whose space infos and device count are examined.
 *
 * Returns the number of tolerated barrier failures.
 */
int btrfs_calc_num_tolerated_disk_barrier_failures(
	struct btrfs_fs_info *fs_info)
{
	const u64 space_types[] = {
		BTRFS_BLOCK_GROUP_DATA,
		BTRFS_BLOCK_GROUP_SYSTEM,
		BTRFS_BLOCK_GROUP_METADATA,
		BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA,
	};
	const int nr_space_types =
		sizeof(space_types) / sizeof(space_types[0]);
	struct btrfs_ioctl_space_info space;
	int tolerated = (int)fs_info->fs_devices->num_devices;
	int t;

	for (t = 0; t < nr_space_types; t++) {
		struct btrfs_space_info *found = NULL;
		struct btrfs_space_info *cur;
		int raid;

		/* Look up the space info whose flags match this type exactly. */
		rcu_read_lock();
		list_for_each_entry_rcu(cur, &fs_info->space_info, list) {
			if (cur->flags != space_types[t])
				continue;
			found = cur;
			break;
		}
		rcu_read_unlock();

		if (!found)
			continue;

		/* Hold groups_sem for reading while the per-RAID lists are walked. */
		down_read(&found->groups_sem);
		for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
			u64 profile;
			int no_redundancy;

			if (list_empty(&found->block_groups[raid]))
				continue;

			btrfs_get_block_group_info(&found->block_groups[raid],
						   &space);
			if (!space.total_bytes || !space.used_bytes)
				continue;

			profile = space.flags;
			/* single (no profile bit), DUP and RAID0 tolerate nothing */
			no_redundancy =
				(profile & (BTRFS_BLOCK_GROUP_DUP |
					    BTRFS_BLOCK_GROUP_RAID0)) ||
				!(profile & BTRFS_BLOCK_GROUP_PROFILE_MASK);

			if (tolerated > 0 && no_redundancy)
				tolerated = 0;
			else if (tolerated > 1 &&
				 (profile & (BTRFS_BLOCK_GROUP_RAID1 |
					     BTRFS_BLOCK_GROUP_RAID10)))
				tolerated = 1;
		}
		up_read(&found->groups_sem);
	}

	return tolerated;
}
int write_all_supers(struct btrfs_root *root, int max_mirrors) int write_all_supers(struct btrfs_root *root, int max_mirrors)
{ {
struct list_head *head; struct list_head *head;
...@@ -2990,8 +3065,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) ...@@ -2990,8 +3065,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex); mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices; head = &root->fs_info->fs_devices->devices;
if (do_barriers) if (do_barriers) {
barrier_all_devices(root->fs_info); ret = barrier_all_devices(root->fs_info);
if (ret) {
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
btrfs_error(root->fs_info, ret,
"errors while submitting device barriers.");
return ret;
}
}
list_for_each_entry_rcu(dev, head, dev_list) { list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) { if (!dev->bdev) {
......
...@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, ...@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid); u64 objectid);
int btree_lock_page_hook(struct page *page, void *data, int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *)); void (*flush_fn)(void *));
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info);
#ifdef CONFIG_DEBUG_LOCK_ALLOC #ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void); void btrfs_init_lockdep(void);
......
...@@ -2875,8 +2875,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ...@@ -2875,8 +2875,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
return 0; return 0;
} }
static void get_block_group_info(struct list_head *groups_list, void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space) struct btrfs_ioctl_space_info *space)
{ {
struct btrfs_block_group_cache *block_group; struct btrfs_block_group_cache *block_group;
...@@ -2984,8 +2984,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) ...@@ -2984,8 +2984,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
down_read(&info->groups_sem); down_read(&info->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
if (!list_empty(&info->block_groups[c])) { if (!list_empty(&info->block_groups[c])) {
get_block_group_info(&info->block_groups[c], btrfs_get_block_group_info(
&space); &info->block_groups[c], &space);
memcpy(dest, &space, sizeof(space)); memcpy(dest, &space, sizeof(space));
dest++; dest++;
space_args.total_spaces++; space_args.total_spaces++;
......
...@@ -2425,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ...@@ -2425,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* in and cause problems either. * in and cause problems either.
*/ */
btrfs_scrub_pause_super(root); btrfs_scrub_pause_super(root);
write_ctree_super(trans, root->fs_info->tree_root, 1); ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
btrfs_scrub_continue_super(root); btrfs_scrub_continue_super(root);
ret = 0; if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out_wake_log_root;
}
mutex_lock(&root->log_mutex); mutex_lock(&root->log_mutex);
if (root->last_log_commit < log_transid) if (root->last_log_commit < log_transid)
......
...@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ...@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
free_fs_devices(cur_devices); free_fs_devices(cur_devices);
} }
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
/* /*
* at this point, the device is zero sized. We want to * at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super * remove it from the devices list and zero out the old super
...@@ -1799,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ...@@ -1799,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_clear_space_info_full(root->fs_info); btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root); unlock_chunks(root);
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
ret = btrfs_commit_transaction(trans, root); ret = btrfs_commit_transaction(trans, root);
if (seeding_dev) { if (seeding_dev) {
...@@ -2809,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl, ...@@ -2809,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
} }
} }
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
int num_tolerated_disk_barrier_failures;
u64 target = bctl->sys.target;
num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
if (num_tolerated_disk_barrier_failures > 0 &&
(target &
(BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
num_tolerated_disk_barrier_failures = 0;
else if (num_tolerated_disk_barrier_failures > 1 &&
(target &
(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
num_tolerated_disk_barrier_failures = 1;
fs_info->num_tolerated_disk_barrier_failures =
num_tolerated_disk_barrier_failures;
}
ret = insert_balance_item(fs_info->tree_root, bctl); ret = insert_balance_item(fs_info->tree_root, bctl);
if (ret && ret != -EEXIST) if (ret && ret != -EEXIST)
goto out; goto out;
...@@ -2841,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, ...@@ -2841,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
__cancel_balance(fs_info); __cancel_balance(fs_info);
} }
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
}
wake_up(&fs_info->balance_wait_q); wake_up(&fs_info->balance_wait_q);
return ret; return ret;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment