Commit b87570f5 authored by Mikulas Patocka's avatar Mikulas Patocka Committed by Jens Axboe

Fix a crash when block device is read and block size is changed at the same time

The kernel may crash when block size is changed and I/O is issued
simultaneously.

Because some subsystems (udev or lvm) may read any block device anytime,
the bug actually puts any code that changes a block device size in
jeopardy.

The crash can be reproduced if you place "msleep(1000)" to
blkdev_get_blocks just before "bh->b_size = max_blocks <<
inode->i_blkbits;".
Then, run "dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct"
While it is waiting in msleep, run "blockdev --setbsz 2048 /dev/ram0"
You get a BUG.

The direct and non-direct I/O is written with the assumption that block
size does not change. It doesn't seem practical to fix these crashes
one-by-one there may be many crash possibilities when block size changes
at a certain place and it is impossible to find them all and verify the
code.

This patch introduces a new rw-lock bd_block_size_semaphore. The lock is
taken for read during I/O. It is taken for write when changing block
size. Consequently, block size can't be changed while I/O is being
submitted.

For asynchronous I/O, the patch only prevents block size change while
the I/O is being submitted. The block size can change when the I/O is in
progress or when the I/O is being finished. This is acceptable because
there are no accesses to block size when asynchronous I/O is being
finished.

The patch prevents block size changing while the device is mapped with
mmap.
Signed-off-by: default avatarMikulas Patocka <mpatocka@redhat.com>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 60ea8226
...@@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd, ...@@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd,
static const struct file_operations raw_fops = { static const struct file_operations raw_fops = {
.read = do_sync_read, .read = do_sync_read,
.aio_read = generic_file_aio_read, .aio_read = blkdev_aio_read,
.write = do_sync_write, .write = do_sync_write,
.aio_write = blkdev_aio_write, .aio_write = blkdev_aio_write,
.fsync = blkdev_fsync, .fsync = blkdev_fsync,
......
...@@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev); ...@@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev);
int set_blocksize(struct block_device *bdev, int size) int set_blocksize(struct block_device *bdev, int size)
{ {
struct address_space *mapping;
/* Size must be a power of two, and between 512 and PAGE_SIZE */ /* Size must be a power of two, and between 512 and PAGE_SIZE */
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
return -EINVAL; return -EINVAL;
...@@ -124,6 +126,20 @@ int set_blocksize(struct block_device *bdev, int size) ...@@ -124,6 +126,20 @@ int set_blocksize(struct block_device *bdev, int size)
if (size < bdev_logical_block_size(bdev)) if (size < bdev_logical_block_size(bdev))
return -EINVAL; return -EINVAL;
/* Prevent starting I/O or mapping the device */
down_write(&bdev->bd_block_size_semaphore);
/* Check that the block device is not memory mapped */
mapping = bdev->bd_inode->i_mapping;
mutex_lock(&mapping->i_mmap_mutex);
if (!prio_tree_empty(&mapping->i_mmap) ||
!list_empty(&mapping->i_mmap_nonlinear)) {
mutex_unlock(&mapping->i_mmap_mutex);
up_write(&bdev->bd_block_size_semaphore);
return -EBUSY;
}
mutex_unlock(&mapping->i_mmap_mutex);
/* Don't change the size if it is same as current */ /* Don't change the size if it is same as current */
if (bdev->bd_block_size != size) { if (bdev->bd_block_size != size) {
sync_blockdev(bdev); sync_blockdev(bdev);
...@@ -131,6 +147,9 @@ int set_blocksize(struct block_device *bdev, int size) ...@@ -131,6 +147,9 @@ int set_blocksize(struct block_device *bdev, int size)
bdev->bd_inode->i_blkbits = blksize_bits(size); bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev); kill_bdev(bdev);
} }
up_write(&bdev->bd_block_size_semaphore);
return 0; return 0;
} }
...@@ -472,6 +491,7 @@ static void init_once(void *foo) ...@@ -472,6 +491,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode); inode_init_once(&ei->vfs_inode);
/* Initialize mutex for freeze. */ /* Initialize mutex for freeze. */
mutex_init(&bdev->bd_fsfreeze_mutex); mutex_init(&bdev->bd_fsfreeze_mutex);
init_rwsem(&bdev->bd_block_size_semaphore);
} }
static inline void __bd_forget(struct inode *inode) static inline void __bd_forget(struct inode *inode)
...@@ -1567,6 +1587,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ...@@ -1567,6 +1587,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return blkdev_ioctl(bdev, mode, cmd, arg); return blkdev_ioctl(bdev, mode, cmd, arg);
} }
ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
ssize_t ret;
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
down_read(&bdev->bd_block_size_semaphore);
ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
up_read(&bdev->bd_block_size_semaphore);
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_aio_read);
/* /*
* Write data to the block device. Only intended for the block device itself * Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device. * and the raw driver which basically is a fake block device.
...@@ -1578,12 +1614,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, ...@@ -1578,12 +1614,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos) unsigned long nr_segs, loff_t pos)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct blk_plug plug; struct blk_plug plug;
ssize_t ret; ssize_t ret;
BUG_ON(iocb->ki_pos != pos); BUG_ON(iocb->ki_pos != pos);
blk_start_plug(&plug); blk_start_plug(&plug);
down_read(&bdev->bd_block_size_semaphore);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) { if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err; ssize_t err;
...@@ -1592,11 +1632,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, ...@@ -1592,11 +1632,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0 && ret > 0) if (err < 0 && ret > 0)
ret = err; ret = err;
} }
up_read(&bdev->bd_block_size_semaphore);
blk_finish_plug(&plug); blk_finish_plug(&plug);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(blkdev_aio_write); EXPORT_SYMBOL_GPL(blkdev_aio_write);
int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret;
struct block_device *bdev = I_BDEV(file->f_mapping->host);
down_read(&bdev->bd_block_size_semaphore);
ret = generic_file_mmap(file, vma);
up_read(&bdev->bd_block_size_semaphore);
return ret;
}
/* /*
* Try to release a page associated with block device when the system * Try to release a page associated with block device when the system
* is under memory pressure. * is under memory pressure.
...@@ -1627,9 +1685,9 @@ const struct file_operations def_blk_fops = { ...@@ -1627,9 +1685,9 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek, .llseek = block_llseek,
.read = do_sync_read, .read = do_sync_read,
.write = do_sync_write, .write = do_sync_write,
.aio_read = generic_file_aio_read, .aio_read = blkdev_aio_read,
.aio_write = blkdev_aio_write, .aio_write = blkdev_aio_write,
.mmap = generic_file_mmap, .mmap = blkdev_mmap,
.fsync = blkdev_fsync, .fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl, .unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
......
...@@ -725,6 +725,8 @@ struct block_device { ...@@ -725,6 +725,8 @@ struct block_device {
int bd_fsfreeze_count; int bd_fsfreeze_count;
/* Mutex for freeze */ /* Mutex for freeze */
struct mutex bd_fsfreeze_mutex; struct mutex bd_fsfreeze_mutex;
/* A semaphore that prevents I/O while block size is being changed */
struct rw_semaphore bd_block_size_semaphore;
}; };
/* /*
...@@ -2565,6 +2567,8 @@ extern int generic_segment_checks(const struct iovec *iov, ...@@ -2565,6 +2567,8 @@ extern int generic_segment_checks(const struct iovec *iov,
unsigned long *nr_segs, size_t *count, int access_flags); unsigned long *nr_segs, size_t *count, int access_flags);
/* fs/block_dev.c */ /* fs/block_dev.c */
extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos); unsigned long nr_segs, loff_t pos);
extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment