Commit 77ea887e authored by Tejun Heo, committed by Jens Axboe

implement in-kernel gendisk events handling

Currently, media presence polling for removable block devices is done
from userland.  There are several issues with this.

* Polling is done by periodically opening the device.  For SCSI
  devices, the command sequence generated by such action involves a
  few different commands including TEST_UNIT_READY.  This behavior,
  while perfectly legal, is different from Windows which only issues
  a single command, GET_EVENT_STATUS_NOTIFICATION.  Unfortunately,
  some ATAPI devices lock up after being periodically queried with
  such command sequences.

* There is no reliable and unintrusive way for a userland program to
  tell whether the target device is safe for media presence polling.
  For example, polling for media presence during an on-going burning
  session can make it fail.  The polling program can avoid this by
  opening the device with O_EXCL but then it risks making a valid
  exclusive user of the device fail w/ -EBUSY.

* Userland polling is unnecessarily heavy and in-kernel implementation
  is lighter and better coordinated (workqueue, timer slack).

This patch implements a framework for in-kernel disk event handling,
which includes media presence polling.

* bdops->check_events() is added, which supersedes ->media_changed().
  It should check whether there are any pending events and return
  them if so.  Currently, two events are defined -
  DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST.
  ->check_events() is guaranteed not to be called in parallel.

* gendisk->events and ->async_events are added.  These should be
  initialized by block driver before passing the device to add_disk().
  The former contains the mask of all supported events and the latter
  the mask of all events which the device can report without polling.
  /sys/block/*/events[_async] export these to userland.

* Kernel parameter block.events_dfl_poll_msecs controls the system
  polling interval (default is 0 which means disable) and
  /sys/block/*/events_poll_msecs control polling intervals for
  individual devices (default is -1 meaning use system setting).  Note
  that if a device can report all supported events asynchronously and
  its polling interval isn't explicitly set, the device won't be
  polled regardless of the system polling interval.

* If a device is opened exclusively with write access, event checking
  is automatically disabled until all write exclusive accesses are
  released.

* There are event 'clearing' events.  For example, both of currently
  defined events are cleared after the device has been successfully
  opened.  This information is passed to ->check_events() callback
  using @clearing argument as a hint.

* Event checking is always performed from system_nrt_wq and timer
  slack is set to 25% for polling.

* Nothing changes for drivers which implement ->media_changed() but
  not ->check_events().  Going forward, all drivers will be converted
  to ->check_events() and ->media_changed() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
parent d2bf1b67
This diff is collapsed.
......@@ -948,10 +948,11 @@ int check_disk_change(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
const struct block_device_operations *bdops = disk->fops;
unsigned int events;
if (!bdops->media_changed)
return 0;
if (!bdops->media_changed(bdev->bd_disk))
events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
DISK_EVENT_EJECT_REQUEST);
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return 0;
flush_disk(bdev);
......@@ -1158,9 +1159,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
if (whole) {
/* finish claiming */
mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
if (res == 0) {
if (!res) {
BUG_ON(!bd_may_claim(bdev, whole, holder));
/*
* Note that for a whole device bd_holders
......@@ -1180,6 +1182,20 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
wake_up_bit(&whole->bd_claiming, 0);
spin_unlock(&bdev_lock);
/*
* Block event polling for write claims. Any write
* holder makes the write_holder state stick until all
* are released. This is good enough and tracking
* individual writeable reference is too fragile given
* the way @mode is used in blkdev_get/put().
*/
if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
bdev->bd_write_holder = true;
disk_block_events(bdev->bd_disk);
}
mutex_unlock(&bdev->bd_mutex);
bdput(whole);
}
......@@ -1353,12 +1369,23 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
spin_unlock(&bdev_lock);
/* if this was the last claim, holder link should go too */
if (bdev_free)
/*
* If this was the last claim, remove holder link and
* unblock evpoll if it was a write holder.
*/
if (bdev_free) {
bd_unlink_disk_holder(bdev);
if (bdev->bd_write_holder) {
disk_unblock_events(bdev->bd_disk);
bdev->bd_write_holder = false;
} else
disk_check_events(bdev->bd_disk);
}
mutex_unlock(&bdev->bd_mutex);
}
} else
disk_check_events(bdev->bd_disk);
return __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);
......
......@@ -1251,6 +1251,9 @@ struct block_device_operations {
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*direct_access) (struct block_device *, sector_t,
void **, unsigned long *);
unsigned int (*check_events) (struct gendisk *disk,
unsigned int clearing);
/* ->media_changed() is DEPRECATED, use ->check_events() instead */
int (*media_changed) (struct gendisk *);
void (*unlock_native_capacity) (struct gendisk *);
int (*revalidate_disk) (struct gendisk *);
......
......@@ -662,6 +662,7 @@ struct block_device {
void * bd_claiming;
void * bd_holder;
int bd_holders;
bool bd_write_holder;
#ifdef CONFIG_SYSFS
struct gendisk * bd_holder_disk; /* for sysfs slave linkng */
#endif
......
......@@ -127,6 +127,11 @@ struct hd_struct {
#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
#define GENHD_FL_NATIVE_CAPACITY 128
/*
 * Disk event bits.  Each event occupies its own bit so that several can
 * be OR'd together into a single unsigned int mask, e.g. the mask passed
 * to disk_clear_events().
 */
enum {
DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */
};
#define BLK_SCSI_MAX_CMDS (256)
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
......@@ -143,6 +148,8 @@ struct disk_part_tbl {
struct hd_struct __rcu *part[];
};
struct disk_events;
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
......@@ -154,6 +161,10 @@ struct gendisk {
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, mode_t *mode);
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
......@@ -171,8 +182,8 @@ struct gendisk {
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
......@@ -405,6 +416,11 @@ static inline int get_disk_ro(struct gendisk *disk)
return disk->part0.policy;
}
extern void disk_block_events(struct gendisk *disk);
extern void disk_unblock_events(struct gendisk *disk);
extern void disk_check_events(struct gendisk *disk);
extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk);
extern void rand_initialize_disk(struct gendisk *disk);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment