Commit d3374825 authored by NeilBrown's avatar NeilBrown

md: make devices disappear when they are no longer needed.

Currently md devices, once created, never disappear until the module
is unloaded.  This is essentially because the gendisk holds a
reference to the mddev, and the mddev holds a reference to the
gendisk, this a circular reference.

If we drop the reference from mddev to gendisk, then we need to ensure
that the mddev is destroyed when the gendisk is destroyed.  However it
is not possible to hook into the gendisk destruction process to enable
this.

So we drop the reference from the gendisk to the mddev and destroy the
gendisk when the mddev gets destroyed.  However this has a
complication.
Between the call
   __blkdev_get->get_gendisk->kobj_lookup->md_probe
and the call
   __blkdev_get->md_open

there is no obvious way to hold a reference on the mddev any more, so
unless something is done, it will disappear and gendisk will be
destroyed prematurely.

Also, once we decide to destroy the mddev, there will be an unlockable
moment before the gendisk is unlinked (blk_unregister_region) during
which a new reference to the gendisk can be created.  We need to
ensure that this reference can not be used.  i.e. the ->open must
fail.

So:
 1/  in md_probe we set a flag in the mddev (hold_active) which
     indicates that the array should be treated as active, even
     though there are no references, and no appearance of activity.
     This is cleared by md_release when the device is closed if it
     is no longer needed.
     This ensures that the gendisk will survive between md_probe and
     md_open.

 2/  In md_open we check if the mddev we expect to open matches
     the gendisk that we did open.
     If there is a mismatch we return -ERESTARTSYS and modify
     __blkdev_get to retry from the top in that case.
     In the -ERESTARTSYS sys case we make sure to wait until
     the old gendisk (that we succeeded in opening) is really gone so
     we loop at most once.

Some udev configurations will always open an md device when it first
appears.   If we allow an md device that was just created by an open
to disappear on an immediate close, then this can race with such udev
configurations and result in an infinite loop the device being opened
and closed, then re-open due to the 'ADD' even from the first open,
and then close and so on.
So we make sure an md device, once created by an open, remains active
at least until some md 'ioctl' has been made on it.  This means that
all normal usage of md devices will allow them to disappear promptly
when not needed, but the worst that an incorrect usage will do it
cause an inactive md device to be left in existence (it can easily be
removed).

As an array can be stopped by writing to a sysfs attribute
  echo clear > /sys/block/mdXXX/md/array_state
we need to use scheduled work for deleting the gendisk and other
kobjects.  This allows us to wait for any pending gendisk deletion to
complete by simply calling flush_scheduled_work().
Signed-off-by: default avatarNeilBrown <neilb@suse.de>
parent a21d1504
...@@ -214,15 +214,32 @@ static inline mddev_t *mddev_get(mddev_t *mddev) ...@@ -214,15 +214,32 @@ static inline mddev_t *mddev_get(mddev_t *mddev)
return mddev; return mddev;
} }
static void mddev_delayed_delete(struct work_struct *ws)
{
mddev_t *mddev = container_of(ws, mddev_t, del_work);
kobject_del(&mddev->kobj);
kobject_put(&mddev->kobj);
}
static void mddev_put(mddev_t *mddev) static void mddev_put(mddev_t *mddev)
{ {
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return; return;
if (!mddev->raid_disks && list_empty(&mddev->disks)) { if (!mddev->raid_disks && list_empty(&mddev->disks) &&
!mddev->hold_active) {
list_del(&mddev->all_mddevs); list_del(&mddev->all_mddevs);
spin_unlock(&all_mddevs_lock); if (mddev->gendisk) {
kobject_put(&mddev->kobj); /* we did a probe so need to clean up.
* Call schedule_work inside the spinlock
* so that flush_scheduled_work() after
* mddev_find will succeed in waiting for the
* work to be done.
*/
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
schedule_work(&mddev->del_work);
} else } else
kfree(mddev);
}
spin_unlock(&all_mddevs_lock); spin_unlock(&all_mddevs_lock);
} }
...@@ -242,6 +259,7 @@ static mddev_t * mddev_find(dev_t unit) ...@@ -242,6 +259,7 @@ static mddev_t * mddev_find(dev_t unit)
if (new) { if (new) {
list_add(&new->all_mddevs, &all_mddevs); list_add(&new->all_mddevs, &all_mddevs);
mddev->hold_active = UNTIL_IOCTL;
spin_unlock(&all_mddevs_lock); spin_unlock(&all_mddevs_lock);
return new; return new;
} }
...@@ -3435,6 +3453,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, ...@@ -3435,6 +3453,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_ADMIN))
return -EACCES; return -EACCES;
rv = mddev_lock(mddev); rv = mddev_lock(mddev);
if (mddev->hold_active == UNTIL_IOCTL)
mddev->hold_active = 0;
if (!rv) { if (!rv) {
rv = entry->store(mddev, page, length); rv = entry->store(mddev, page, length);
mddev_unlock(mddev); mddev_unlock(mddev);
...@@ -3484,6 +3504,11 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) ...@@ -3484,6 +3504,11 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
if (!mddev) if (!mddev)
return NULL; return NULL;
/* wait for any previous instance if this device
* to be completed removed (mddev_delayed_delete).
*/
flush_scheduled_work();
mutex_lock(&disks_mutex); mutex_lock(&disks_mutex);
if (mddev->gendisk) { if (mddev->gendisk) {
mutex_unlock(&disks_mutex); mutex_unlock(&disks_mutex);
...@@ -3520,7 +3545,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) ...@@ -3520,7 +3545,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
disk->private_data = mddev; disk->private_data = mddev;
disk->queue = mddev->queue; disk->queue = mddev->queue;
/* Allow extended partitions. This makes the /* Allow extended partitions. This makes the
* 'mdp' device redundant, but we can really * 'mdp' device redundant, but we can't really
* remove it now. * remove it now.
*/ */
disk->flags |= GENHD_FL_EXT_DEVT; disk->flags |= GENHD_FL_EXT_DEVT;
...@@ -3536,6 +3561,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) ...@@ -3536,6 +3561,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
kobject_uevent(&mddev->kobj, KOBJ_ADD); kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
} }
mddev_put(mddev);
return NULL; return NULL;
} }
...@@ -5054,6 +5080,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, ...@@ -5054,6 +5080,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
done_unlock: done_unlock:
abort_unlock: abort_unlock:
if (mddev->hold_active == UNTIL_IOCTL &&
err != -EINVAL)
mddev->hold_active = 0;
mddev_unlock(mddev); mddev_unlock(mddev);
return err; return err;
...@@ -5070,14 +5099,25 @@ static int md_open(struct block_device *bdev, fmode_t mode) ...@@ -5070,14 +5099,25 @@ static int md_open(struct block_device *bdev, fmode_t mode)
* Succeed if we can lock the mddev, which confirms that * Succeed if we can lock the mddev, which confirms that
* it isn't being stopped right now. * it isn't being stopped right now.
*/ */
mddev_t *mddev = bdev->bd_disk->private_data; mddev_t *mddev = mddev_find(bdev->bd_dev);
int err; int err;
if (mddev->gendisk != bdev->bd_disk) {
/* we are racing with mddev_put which is discarding this
* bd_disk.
*/
mddev_put(mddev);
/* Wait until bdev->bd_disk is definitely gone */
flush_scheduled_work();
/* Then retry the open from the top */
return -ERESTARTSYS;
}
BUG_ON(mddev != bdev->bd_disk->private_data);
if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
goto out; goto out;
err = 0; err = 0;
mddev_get(mddev);
atomic_inc(&mddev->openers); atomic_inc(&mddev->openers);
mddev_unlock(mddev); mddev_unlock(mddev);
...@@ -6436,11 +6476,8 @@ static __exit void md_exit(void) ...@@ -6436,11 +6476,8 @@ static __exit void md_exit(void)
unregister_sysctl_table(raid_table_header); unregister_sysctl_table(raid_table_header);
remove_proc_entry("mdstat", NULL); remove_proc_entry("mdstat", NULL);
for_each_mddev(mddev, tmp) { for_each_mddev(mddev, tmp) {
struct gendisk *disk = mddev->gendisk;
if (!disk)
continue;
export_array(mddev); export_array(mddev);
mddev_put(mddev); mddev->hold_active = 0;
} }
} }
......
...@@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) ...@@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
} }
lock_kernel(); lock_kernel();
restart:
ret = -ENXIO; ret = -ENXIO;
disk = get_gendisk(bdev->bd_dev, &partno); disk = get_gendisk(bdev->bd_dev, &partno);
...@@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) ...@@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (disk->fops->open) { if (disk->fops->open) {
ret = disk->fops->open(bdev, mode); ret = disk->fops->open(bdev, mode);
if (ret == -ERESTARTSYS) {
/* Lost a race with 'disk' being
* deleted, try again.
* See md.c
*/
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
module_put(disk->fops->owner);
put_disk(disk);
bdev->bd_disk = NULL;
mutex_unlock(&bdev->bd_mutex);
goto restart;
}
if (ret) if (ret)
goto out_clear; goto out_clear;
} }
......
...@@ -137,6 +137,8 @@ struct mddev_s ...@@ -137,6 +137,8 @@ struct mddev_s
struct gendisk *gendisk; struct gendisk *gendisk;
struct kobject kobj; struct kobject kobj;
int hold_active;
#define UNTIL_IOCTL 1
/* Superblock information */ /* Superblock information */
int major_version, int major_version,
...@@ -246,6 +248,8 @@ struct mddev_s ...@@ -246,6 +248,8 @@ struct mddev_s
*/ */
struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */
struct work_struct del_work; /* used for delayed sysfs removal */
spinlock_t write_lock; spinlock_t write_lock;
wait_queue_head_t sb_wait; /* for waiting on superblock updates */ wait_queue_head_t sb_wait; /* for waiting on superblock updates */
atomic_t pending_writes; /* number of active superblock writes */ atomic_t pending_writes; /* number of active superblock writes */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment