Commit 0f9a7517 authored by Linus Torvalds

Merge tag 'block-6.10-20240530' of git://git.kernel.dk/linux

Pull block fixes from Jens Axboe:

 - NVMe fixes via Keith:
      - Removing unused fields (Kanchan)
      - Large folio offsets support (Kundan)
      - Multipath NUMA node initialization fix (Nilay)
      - Multipath IO stats accounting fixes (Keith)
      - Circular lockdep fix (Keith)
      - Target race condition fix (Sagi)
      - Target memory leak fix (Sagi)

 - bcache fixes

 - null_blk fixes (Damien)

 - Fix regression in io.max due to throttle low removal (Waiman)

 - DM limit table fixes (Christoph)

 - SCSI and block limit fixes (Christoph)

 - zone fixes (Damien)

 - Misc fixes (Christoph, Hannes, hexue)

* tag 'block-6.10-20240530' of git://git.kernel.dk/linux: (25 commits)
  blk-throttle: Fix incorrect display of io.max
  block: Fix zone write plugging handling of devices with a runt zone
  block: Fix validation of zoned device with a runt zone
  null_blk: Do not allow runt zone with zone capacity smaller than zone size
  nvmet: fix a possible leak when destroy a ctrl during qp establishment
  nvme: use srcu for iterating namespace list
  bcache: code cleanup in __bch_bucket_alloc_set()
  bcache: call force_wake_up_gc() if necessary in check_should_bypass()
  bcache: allow allocator to invalidate bucket in gc
  block: check for max_hw_sectors underflow
  block: stack max_user_sectors
  sd: also set max_user_sectors when setting max_sectors
  null_blk: Print correct max open zones limit in null_init_zoned_dev()
  block: delete redundant function declaration
  null_blk: Fix return value of nullb_device_power_store()
  dm: make dm_set_zones_restrictions work on the queue limits
  dm: remove dm_check_zoned
  dm: move setting zoned_enabled to dm_table_set_restrictions
  block: remove blk_queue_max_integrity_segments
  nvme: adjust multiples of NVME_CTRL_PAGE_SIZE in offset
  ...
parents 6d541d66 0a751df4
......@@ -104,6 +104,7 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
static int blk_validate_limits(struct queue_limits *lim)
{
unsigned int max_hw_sectors;
unsigned int logical_block_sectors;
/*
* Unless otherwise specified, default to 512 byte logical blocks and a
......@@ -134,8 +135,11 @@ static int blk_validate_limits(struct queue_limits *lim)
lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS))
return -EINVAL;
logical_block_sectors = lim->logical_block_size >> SECTOR_SHIFT;
if (WARN_ON_ONCE(logical_block_sectors > lim->max_hw_sectors))
return -EINVAL;
lim->max_hw_sectors = round_down(lim->max_hw_sectors,
lim->logical_block_size >> SECTOR_SHIFT);
logical_block_sectors);
/*
* The actual max_sectors value is a complex beast and also takes the
......@@ -153,7 +157,7 @@ static int blk_validate_limits(struct queue_limits *lim)
lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
}
lim->max_sectors = round_down(lim->max_sectors,
lim->logical_block_size >> SECTOR_SHIFT);
logical_block_sectors);
/*
* Random default for the maximum number of segments. Driver should not
......@@ -611,6 +615,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
unsigned int top, bottom, alignment, ret = 0;
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_user_sectors = min_not_zero(t->max_user_sectors,
b->max_user_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
......
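
A stand-alone illustration of the rule the blk-settings.c hunks enforce may help: max_hw_sectors (and, downstream, max_sectors) must cover at least one logical block and is rounded down to a whole number of logical blocks. clamp_hw_sectors() below is a hypothetical helper written for this note, not a kernel function.

#include <stdio.h>

#define SECTOR_SHIFT 9

/* Hypothetical helper mirroring the new checks in blk_validate_limits():
 * reject a hardware limit smaller than one logical block, otherwise
 * round it down to a whole number of logical blocks. */
static int clamp_hw_sectors(unsigned int logical_block_size,
                            unsigned int max_hw_sectors)
{
        unsigned int logical_block_sectors = logical_block_size >> SECTOR_SHIFT;

        if (logical_block_sectors > max_hw_sectors)
                return -1;                      /* kernel returns -EINVAL here */
        return max_hw_sectors - (max_hw_sectors % logical_block_sectors);
}

int main(void)
{
        printf("%d\n", clamp_hw_sectors(4096, 12)); /* 8: rounded down to one 4k block */
        printf("%d\n", clamp_hw_sectors(4096, 4));  /* -1: below one logical block */
        return 0;
}
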
......@@ -64,7 +64,6 @@ struct blk_stat_callback {
struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *);
bool blk_stats_alloc_enable(struct request_queue *q);
void blk_stat_add(struct request *rq, u64 now);
......
......@@ -1399,32 +1399,32 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
bps_dft = U64_MAX;
iops_dft = UINT_MAX;
if (tg->bps_conf[READ] == bps_dft &&
tg->bps_conf[WRITE] == bps_dft &&
tg->iops_conf[READ] == iops_dft &&
tg->iops_conf[WRITE] == iops_dft)
if (tg->bps[READ] == bps_dft &&
tg->bps[WRITE] == bps_dft &&
tg->iops[READ] == iops_dft &&
tg->iops[WRITE] == iops_dft)
return 0;
seq_printf(sf, "%s", dname);
if (tg->bps_conf[READ] == U64_MAX)
if (tg->bps[READ] == U64_MAX)
seq_printf(sf, " rbps=max");
else
seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]);
seq_printf(sf, " rbps=%llu", tg->bps[READ]);
if (tg->bps_conf[WRITE] == U64_MAX)
if (tg->bps[WRITE] == U64_MAX)
seq_printf(sf, " wbps=max");
else
seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]);
seq_printf(sf, " wbps=%llu", tg->bps[WRITE]);
if (tg->iops_conf[READ] == UINT_MAX)
if (tg->iops[READ] == UINT_MAX)
seq_printf(sf, " riops=max");
else
seq_printf(sf, " riops=%u", tg->iops_conf[READ]);
seq_printf(sf, " riops=%u", tg->iops[READ]);
if (tg->iops_conf[WRITE] == UINT_MAX)
if (tg->iops[WRITE] == UINT_MAX)
seq_printf(sf, " wiops=max");
else
seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]);
seq_printf(sf, " wiops=%u", tg->iops[WRITE]);
seq_printf(sf, "\n");
return 0;
......
......@@ -95,15 +95,11 @@ struct throtl_grp {
bool has_rules_bps[2];
bool has_rules_iops[2];
/* internally used bytes per second rate limits */
/* bytes per second rate limits */
uint64_t bps[2];
/* user configured bps limits */
uint64_t bps_conf[2];
/* internally used IOPS limits */
/* IOPS limits */
unsigned int iops[2];
/* user configured IOPS limits */
unsigned int iops_conf[2];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
......
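
With the *_conf arrays removed, io.max is now formatted from the same bps[]/iops[] values the throttler enforces, so the file can no longer show stale limits. A stand-alone sketch of that formatting; struct tg_limits and print_io_max() are made-up stand-ins for throtl_grp and tg_prfill_limit():

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

enum { READ, WRITE };

/* Made-up stand-in for the trimmed throtl_grp: one limit array per
 * direction, no separate *_conf copy that can fall out of sync. */
struct tg_limits {
        uint64_t bps[2];
        unsigned int iops[2];
};

static void print_bps(const char *key, uint64_t v)
{
        if (v == UINT64_MAX)
                printf(" %s=max", key);
        else
                printf(" %s=%llu", key, (unsigned long long)v);
}

static void print_iops(const char *key, unsigned int v)
{
        if (v == UINT_MAX)
                printf(" %s=max", key);
        else
                printf(" %s=%u", key, v);
}

static void print_io_max(const char *dname, const struct tg_limits *tg)
{
        if (tg->bps[READ] == UINT64_MAX && tg->bps[WRITE] == UINT64_MAX &&
            tg->iops[READ] == UINT_MAX && tg->iops[WRITE] == UINT_MAX)
                return;  /* everything still at the default: no line for this device */

        printf("%s", dname);
        print_bps("rbps", tg->bps[READ]);
        print_bps("wbps", tg->bps[WRITE]);
        print_iops("riops", tg->iops[READ]);
        print_iops("wiops", tg->iops[WRITE]);
        printf("\n");
}

int main(void)
{
        struct tg_limits tg = {
                .bps  = { 1048576, UINT64_MAX },
                .iops = { UINT_MAX, 1000 },
        };

        print_io_max("8:0", &tg); /* "8:0 rbps=1048576 wbps=max riops=max wiops=1000" */
        return 0;
}
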
......@@ -450,6 +450,25 @@ static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
}
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
return zone->start + zone->len >= get_capacity(disk);
}
static bool disk_zone_is_full(struct gendisk *disk,
unsigned int zno, unsigned int offset_in_zone)
{
if (zno < disk->nr_zones - 1)
return offset_in_zone >= disk->zone_capacity;
return offset_in_zone >= disk->last_zone_capacity;
}
static bool disk_zone_wplug_is_full(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}
static bool disk_insert_zone_wplug(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
......@@ -543,7 +562,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
return false;
/* We can remove zone write plugs for zones that are empty or full. */
return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity;
return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}
static void disk_remove_zone_wplug(struct gendisk *disk,
......@@ -664,13 +683,12 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
unsigned int zone_capacity = disk->zone_capacity;
unsigned int wp_offset = zwplug->wp_offset;
struct bio_list bl = BIO_EMPTY_LIST;
struct bio *bio;
while ((bio = bio_list_pop(&zwplug->bio_list))) {
if (wp_offset >= zone_capacity ||
if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) ||
(bio_op(bio) != REQ_OP_ZONE_APPEND &&
bio_offset_from_zone_start(bio) != wp_offset)) {
blk_zone_wplug_bio_io_error(zwplug, bio);
......@@ -909,7 +927,6 @@ void blk_zone_write_plug_init_request(struct request *req)
sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
struct request_queue *q = req->q;
struct gendisk *disk = q->disk;
unsigned int zone_capacity = disk->zone_capacity;
struct blk_zone_wplug *zwplug =
disk_get_zone_wplug(disk, blk_rq_pos(req));
unsigned long flags;
......@@ -933,7 +950,7 @@ void blk_zone_write_plug_init_request(struct request *req)
* into the back of the request.
*/
spin_lock_irqsave(&zwplug->lock, flags);
while (zwplug->wp_offset < zone_capacity) {
while (!disk_zone_wplug_is_full(disk, zwplug)) {
bio = bio_list_peek(&zwplug->bio_list);
if (!bio)
break;
......@@ -979,7 +996,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
* We know such BIO will fail, and that would potentially overflow our
* write pointer offset beyond the end of the zone.
*/
if (zwplug->wp_offset >= disk->zone_capacity)
if (disk_zone_wplug_is_full(disk, zwplug))
goto err;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
......@@ -1556,6 +1573,7 @@ void disk_free_zone_resources(struct gendisk *disk)
kfree(disk->conv_zones_bitmap);
disk->conv_zones_bitmap = NULL;
disk->zone_capacity = 0;
disk->last_zone_capacity = 0;
disk->nr_zones = 0;
}
......@@ -1600,6 +1618,7 @@ struct blk_revalidate_zone_args {
unsigned long *conv_zones_bitmap;
unsigned int nr_zones;
unsigned int zone_capacity;
unsigned int last_zone_capacity;
sector_t sector;
};
......@@ -1617,6 +1636,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
disk->nr_zones = args->nr_zones;
disk->zone_capacity = args->zone_capacity;
disk->last_zone_capacity = args->last_zone_capacity;
swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
if (disk->conv_zones_bitmap)
nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
......@@ -1668,6 +1688,9 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
if (disk_zone_is_last(disk, zone))
args->last_zone_capacity = zone->capacity;
if (!disk_need_zone_resources(disk))
return 0;
......@@ -1693,11 +1716,14 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
/*
* Remember the capacity of the first sequential zone and check
* if it is constant for all zones.
* if it is constant for all zones, ignoring the last zone as it can be
* smaller.
*/
if (!args->zone_capacity)
args->zone_capacity = zone->capacity;
if (zone->capacity != args->zone_capacity) {
if (disk_zone_is_last(disk, zone)) {
args->last_zone_capacity = zone->capacity;
} else if (zone->capacity != args->zone_capacity) {
pr_warn("%s: Invalid variable zone capacity\n",
disk->disk_name);
return -ENODEV;
......@@ -1732,7 +1758,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
{
struct blk_revalidate_zone_args *args = data;
struct gendisk *disk = args->disk;
sector_t capacity = get_capacity(disk);
sector_t zone_sectors = disk->queue->limits.chunk_sectors;
int ret;
......@@ -1743,7 +1768,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
if (zone->start >= capacity || !zone->len) {
if (zone->start >= get_capacity(disk) || !zone->len) {
pr_warn("%s: Invalid zone start %llu, length %llu\n",
disk->disk_name, zone->start, zone->len);
return -ENODEV;
......@@ -1753,7 +1778,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
* All zones must have the same size, with the exception on an eventual
* smaller last zone.
*/
if (zone->start + zone->len < capacity) {
if (!disk_zone_is_last(disk, zone)) {
if (zone->len != zone_sectors) {
pr_warn("%s: Invalid zoned device with non constant zone size\n",
disk->disk_name);
......
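
The helpers above make "zone is full" a per-zone question: every zone uses the common zone_capacity except the last one, which may be a runt with its own last_zone_capacity. A stand-alone restatement of that check with example numbers; fake_disk and zone_is_full() are illustrative, not kernel code:

#include <stdbool.h>
#include <stdio.h>

struct fake_disk {
        unsigned int nr_zones;
        unsigned int zone_capacity;      /* sectors, all zones but the last */
        unsigned int last_zone_capacity; /* sectors, last (possibly runt) zone */
};

static bool zone_is_full(const struct fake_disk *d, unsigned int zno,
                         unsigned int wp_offset)
{
        if (zno < d->nr_zones - 1)
                return wp_offset >= d->zone_capacity;
        return wp_offset >= d->last_zone_capacity;
}

int main(void)
{
        struct fake_disk d = { .nr_zones = 4, .zone_capacity = 1024,
                               .last_zone_capacity = 256 };

        printf("%d\n", zone_is_full(&d, 1, 512)); /* 0: middle zone only half written */
        printf("%d\n", zone_is_full(&d, 3, 512)); /* 1: runt last zone is already full */
        return 0;
}
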
......@@ -494,6 +494,7 @@ static ssize_t nullb_device_power_store(struct config_item *item,
set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
dev->power = newp;
ret = count;
} else if (dev->power && !newp) {
if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
dev->power = newp;
......
......@@ -74,6 +74,17 @@ int null_init_zoned_dev(struct nullb_device *dev,
return -EINVAL;
}
/*
* If a smaller zone capacity was requested, do not allow a smaller last
* zone at the same time as such zone configuration does not correspond
* to any real zoned device.
*/
if (dev->zone_capacity != dev->zone_size &&
dev->size & (dev->zone_size - 1)) {
pr_err("A smaller last zone is not allowed with zone capacity smaller than zone size.\n");
return -EINVAL;
}
zone_capacity_sects = mb_to_sects(dev->zone_capacity);
dev_capacity_sects = mb_to_sects(dev->size);
dev->zone_size_sects = mb_to_sects(dev->zone_size);
......@@ -108,7 +119,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
dev->zone_max_open = dev->zone_max_active;
pr_info("changed the maximum number of open zones to %u\n",
dev->nr_zones);
dev->zone_max_open);
} else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
dev->zone_max_open = 0;
pr_info("zone_max_open limit disabled, limit >= zone count\n");
......
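
The new null_blk check can be summarised as: a runt last zone (device size not a multiple of the zone size) is only accepted when zone capacity equals zone size, since no real device combines both. A stand-alone sketch with sizes in MiB; zoned_config_valid() is a made-up helper and relies on zone_size being a power of two, which null_blk already enforces:

#include <stdbool.h>
#include <stdio.h>

static bool zoned_config_valid(unsigned long long size_mb,
                               unsigned int zone_size_mb,
                               unsigned int zone_capacity_mb)
{
        if (zone_capacity_mb != zone_size_mb && (size_mb & (zone_size_mb - 1)))
                return false;   /* runt last zone plus reduced capacity: rejected */
        return true;
}

int main(void)
{
        printf("%d\n", zoned_config_valid(1024, 256, 256)); /* 1: no runt zone */
        printf("%d\n", zoned_config_valid(1000, 256, 192)); /* 0: runt + capacity < size */
        return 0;
}
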
......@@ -129,12 +129,9 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
BUG_ON(!ca->set->gc_mark_valid);
return (!GC_MARK(b) ||
GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
!atomic_read(&b->pin) &&
can_inc_bucket_gen(b);
return (ca->set->gc_mark_valid || b->reclaimable_in_gc) &&
((!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
!atomic_read(&b->pin) && can_inc_bucket_gen(b));
}
void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
......@@ -148,6 +145,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
bch_inc_gen(ca, b);
b->prio = INITIAL_PRIO;
atomic_inc(&b->pin);
b->reclaimable_in_gc = 0;
}
static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
......@@ -352,8 +350,7 @@ static int bch_allocator_thread(void *arg)
*/
retry_invalidate:
allocator_wait(ca, ca->set->gc_mark_valid &&
!ca->invalidate_needs_gc);
allocator_wait(ca, !ca->invalidate_needs_gc);
invalidate_buckets(ca);
/*
......@@ -501,8 +498,8 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
ca = c->cache;
b = bch_bucket_alloc(ca, reserve, wait);
if (b == -1)
goto err;
if (b < 0)
return -1;
k->ptr[0] = MAKE_PTR(ca->buckets[b].gen,
bucket_to_sector(c, b),
......@@ -511,10 +508,6 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
SET_KEY_PTRS(k, 1);
return 0;
err:
bch_bucket_free(c, k);
bkey_put(c, k);
return -1;
}
int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
......
......@@ -200,6 +200,7 @@ struct bucket {
uint8_t gen;
uint8_t last_gc; /* Most out of date gen in the btree */
uint16_t gc_mark; /* Bitfield used by GC. See below for field */
uint16_t reclaimable_in_gc:1;
};
/*
......
......@@ -1741,18 +1741,20 @@ static void btree_gc_start(struct cache_set *c)
mutex_lock(&c->bucket_lock);
c->gc_mark_valid = 0;
c->gc_done = ZERO_KEY;
ca = c->cache;
for_each_bucket(b, ca) {
b->last_gc = b->gen;
if (bch_can_invalidate_bucket(ca, b))
b->reclaimable_in_gc = 1;
if (!atomic_read(&b->pin)) {
SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0);
}
}
c->gc_mark_valid = 0;
mutex_unlock(&c->bucket_lock);
}
......@@ -1809,6 +1811,9 @@ static void bch_btree_gc_finish(struct cache_set *c)
for_each_bucket(b, ca) {
c->need_gc = max(c->need_gc, bucket_gc_gen(b));
if (b->reclaimable_in_gc)
b->reclaimable_in_gc = 0;
if (atomic_read(&b->pin))
continue;
......
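
Taken together, the alloc.c, bcache.h and btree.c hunks let the allocator keep invalidating buckets while GC is running: btree_gc_start() latches which buckets were already reclaimable, and bch_can_invalidate_bucket() accepts either a valid GC mark or that latched bit. A simplified stand-alone restatement of the predicate; the pin count and can_inc_bucket_gen() are folded into plain booleans:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct bucket: only the fields relevant to
 * the new invalidation rule are kept, and they are plain booleans. */
struct fake_bucket {
        bool pinned;             /* atomic pin count != 0 in bcache */
        bool mark_reclaimable;   /* GC_MARK is 0 or GC_MARK_RECLAIMABLE */
        bool reclaimable_in_gc;  /* latched by btree_gc_start() */
        bool gen_can_increase;   /* can_inc_bucket_gen() in bcache */
};

static bool can_invalidate(bool gc_mark_valid, const struct fake_bucket *b)
{
        /* While GC runs (gc_mark_valid == false) the latched flag stands
         * in for the stale GC marks, so the allocator need not wait. */
        return (gc_mark_valid || b->reclaimable_in_gc) &&
               b->mark_reclaimable && !b->pinned && b->gen_can_increase;
}

int main(void)
{
        struct fake_bucket b = { .mark_reclaimable = true,
                                 .reclaimable_in_gc = true,
                                 .gen_can_increase = true };

        printf("%d\n", can_invalidate(false, &b)); /* 1: usable even mid-GC */
        return 0;
}
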
......@@ -369,10 +369,24 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
struct io *i;
if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
(bio_op(bio) == REQ_OP_DISCARD))
goto skip;
if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) {
/*
* If cached buckets are all clean now, 'true' will be
* returned and all requests will bypass the cache device.
* Then c->sectors_to_gc has no chance to be negative, and
* gc thread won't wake up and caching won't work forever.
* Here call force_wake_up_gc() to avoid such aftermath.
*/
if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN &&
c->gc_mark_valid)
force_wake_up_gc(c);
goto skip;
}
if (mode == CACHE_MODE_NONE ||
(mode == CACHE_MODE_WRITEAROUND &&
op_is_write(bio_op(bio))))
......
......@@ -1981,10 +1981,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_secure_erase(t))
limits->max_secure_erase_sectors = 0;
r = queue_limits_set(q, limits);
if (r)
return r;
if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
wc = true;
if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
......@@ -2036,15 +2032,16 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
* For a zoned target, setup the zones related queue attributes
* and resources necessary for zone append emulation if necessary.
*/
if (blk_queue_is_zoned(q)) {
r = dm_set_zones_restrictions(t, q);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) {
r = dm_set_zones_restrictions(t, q, limits);
if (r)
return r;
if (blk_queue_is_zoned(q) &&
!static_key_enabled(&zoned_enabled.key))
static_branch_enable(&zoned_enabled);
}
r = queue_limits_set(q, limits);
if (r)
return r;
dm_update_crypto_profile(q, t);
/*
......
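
The dm-table.c hunk reorders things so that queue_limits_set() runs once, after the zone code has adjusted the limits copy. A condensed restatement of that ordering, under the assumption that nothing else touches the queue in between (this is not the exact dm-table.c code):

static int set_restrictions_sketch(struct dm_table *t, struct request_queue *q,
                                   struct queue_limits *limits)
{
        int r;

        if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) {
                r = dm_set_zones_restrictions(t, q, limits); /* edits *limits only */
                if (r)
                        return r;
        }

        return queue_limits_set(q, limits);     /* single commit point */
}
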
......@@ -160,37 +160,6 @@ static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx,
return 0;
}
static int dm_check_zoned(struct mapped_device *md, struct dm_table *t)
{
struct gendisk *disk = md->disk;
unsigned int nr_conv_zones = 0;
int ret;
/* Count conventional zones */
md->zone_revalidate_map = t;
ret = dm_blk_report_zones(disk, 0, UINT_MAX,
dm_check_zoned_cb, &nr_conv_zones);
md->zone_revalidate_map = NULL;
if (ret < 0) {
DMERR("Check zoned failed %d", ret);
return ret;
}
/*
* If we only have conventional zones, expose the mapped device as
* a regular device.
*/
if (nr_conv_zones >= ret) {
disk->queue->limits.max_open_zones = 0;
disk->queue->limits.max_active_zones = 0;
disk->queue->limits.zoned = false;
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
disk->nr_zones = 0;
}
return 0;
}
/*
* Revalidate the zones of a mapped device to initialize resource necessary
* for zone append emulation. Note that we cannot simply use the block layer
......@@ -251,9 +220,12 @@ static bool dm_table_supports_zone_append(struct dm_table *t)
return true;
}
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *lim)
{
struct mapped_device *md = t->md;
struct gendisk *disk = md->disk;
unsigned int nr_conv_zones = 0;
int ret;
/*
......@@ -265,21 +237,37 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
} else {
set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
blk_queue_max_zone_append_sectors(q, 0);
lim->max_zone_append_sectors = 0;
}
if (!get_capacity(md->disk))
return 0;
/*
* Check that the mapped device will indeed be zoned, that is, that it
* has sequential write required zones.
* Count conventional zones to check that the mapped device will indeed
* have sequential write required zones.
*/
ret = dm_check_zoned(md, t);
if (ret)
md->zone_revalidate_map = t;
ret = dm_blk_report_zones(disk, 0, UINT_MAX,
dm_check_zoned_cb, &nr_conv_zones);
md->zone_revalidate_map = NULL;
if (ret < 0) {
DMERR("Check zoned failed %d", ret);
return ret;
if (!blk_queue_is_zoned(q))
}
/*
* If we only have conventional zones, expose the mapped device as
* a regular device.
*/
if (nr_conv_zones >= ret) {
lim->max_open_zones = 0;
lim->max_active_zones = 0;
lim->zoned = false;
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
disk->nr_zones = 0;
return 0;
}
if (!md->disk->nr_zones) {
DMINFO("%s using %s zone append",
......@@ -287,7 +275,13 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
queue_emulates_zone_append(q) ? "emulated" : "native");
}
return dm_revalidate_zones(md, t);
ret = dm_revalidate_zones(md, t);
if (ret < 0)
return ret;
if (!static_key_enabled(&zoned_enabled.key))
static_branch_enable(&zoned_enabled);
return 0;
}
/*
......
......@@ -101,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
/*
* Zoned targets related functions.
*/
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *lim);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
......
......@@ -414,7 +414,15 @@ static inline void nvme_end_req_zoned(struct request *req)
}
}
static inline void nvme_end_req(struct request *req)
static inline void __nvme_end_req(struct request *req)
{
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
if (req->cmd_flags & REQ_NVME_MPATH)
nvme_mpath_end_request(req);
}
void nvme_end_req(struct request *req)
{
blk_status_t status = nvme_error_status(nvme_req(req)->status);
......@@ -424,10 +432,7 @@ static inline void nvme_end_req(struct request *req)
else
nvme_log_error(req);
}
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
if (req->cmd_flags & REQ_NVME_MPATH)
nvme_mpath_end_request(req);
__nvme_end_req(req);
blk_mq_end_request(req, status);
}
......@@ -476,7 +481,7 @@ void nvme_complete_batch_req(struct request *req)
{
trace_nvme_complete_rq(req);
nvme_cleanup_cmd(req);
nvme_end_req_zoned(req);
__nvme_end_req(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
......@@ -673,7 +678,7 @@ static void nvme_free_ns(struct kref *kref)
kfree(ns);
}
static inline bool nvme_get_ns(struct nvme_ns *ns)
bool nvme_get_ns(struct nvme_ns *ns)
{
return kref_get_unless_zero(&ns->kref);
}
......@@ -3679,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns, *ret = NULL;
int srcu_idx;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
if (ns->head->ns_id == nsid) {
if (!nvme_get_ns(ns))
continue;
......@@ -3691,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (ns->head->ns_id > nsid)
break;
}
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
return ret;
}
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
......@@ -3705,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
if (tmp->head->ns_id < ns->head->ns_id) {
list_add(&ns->list, &tmp->list);
list_add_rcu(&ns->list, &tmp->list);
return;
}
}
......@@ -3771,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (nvme_update_ns_info(ns, info))
goto out_unlink_ns;
down_write(&ctrl->namespaces_rwsem);
mutex_lock(&ctrl->namespaces_lock);
/*
* Ensure that no namespaces are added to the ctrl list after the queues
* are frozen, thereby avoiding a deadlock between scan and reset.
*/
if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
up_write(&ctrl->namespaces_rwsem);
mutex_unlock(&ctrl->namespaces_lock);
goto out_unlink_ns;
}
nvme_ns_add_to_ctrl_list(ns);
up_write(&ctrl->namespaces_rwsem);
mutex_unlock(&ctrl->namespaces_lock);
synchronize_srcu(&ctrl->srcu);
nvme_get_ctrl(ctrl);
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
......@@ -3804,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
out_cleanup_ns_from_list:
nvme_put_ctrl(ctrl);
down_write(&ctrl->namespaces_rwsem);
list_del_init(&ns->list);
up_write(&ctrl->namespaces_rwsem);
mutex_lock(&ctrl->namespaces_lock);
list_del_rcu(&ns->list);
mutex_unlock(&ctrl->namespaces_lock);
synchronize_srcu(&ctrl->srcu);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
......@@ -3856,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
del_gendisk(ns->disk);
down_write(&ns->ctrl->namespaces_rwsem);
list_del_init(&ns->list);
up_write(&ns->ctrl->namespaces_rwsem);
mutex_lock(&ns->ctrl->namespaces_lock);
list_del_rcu(&ns->list);
mutex_unlock(&ns->ctrl->namespaces_lock);
synchronize_srcu(&ns->ctrl->srcu);
if (last_path)
nvme_mpath_shutdown_disk(ns->head);
......@@ -3948,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
struct nvme_ns *ns, *next;
LIST_HEAD(rm_list);
down_write(&ctrl->namespaces_rwsem);
mutex_lock(&ctrl->namespaces_lock);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
if (ns->head->ns_id > nsid)
list_move_tail(&ns->list, &rm_list);
list_splice_init_rcu(&ns->list, &rm_list,
synchronize_rcu);
}
up_write(&ctrl->namespaces_rwsem);
mutex_unlock(&ctrl->namespaces_lock);
synchronize_srcu(&ctrl->srcu);
list_for_each_entry_safe(ns, next, &rm_list, list)
nvme_ns_remove(ns);
}
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
......@@ -4127,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
down_write(&ctrl->namespaces_rwsem);
list_splice_init(&ctrl->namespaces, &ns_list);
up_write(&ctrl->namespaces_rwsem);
mutex_lock(&ctrl->namespaces_lock);
list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
mutex_unlock(&ctrl->namespaces_lock);
synchronize_srcu(&ctrl->srcu);
list_for_each_entry_safe(ns, next, &ns_list, list)
nvme_ns_remove(ns);
......@@ -4577,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
key_put(ctrl->tls_key);
nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
cleanup_srcu_struct(&ctrl->srcu);
nvme_auth_stop(ctrl);
nvme_auth_free(ctrl);
__free_page(ctrl->discard_page);
......@@ -4609,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->passthru_err_log_enabled = false;
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->namespaces_lock);
ret = init_srcu_struct(&ctrl->srcu);
if (ret)
return ret;
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
xa_init(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
ctrl->ops = ops;
ctrl->quirks = quirks;
......@@ -4692,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
out:
if (ctrl->discard_page)
__free_page(ctrl->discard_page);
cleanup_srcu_struct(&ctrl->srcu);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
......@@ -4700,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
int srcu_idx;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_mark_disk_dead(ns->disk);
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
int srcu_idx;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_mq_unfreeze_queue(ns->queue);
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
......@@ -4723,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
struct nvme_ns *ns;
int srcu_idx;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
if (timeout <= 0)
break;
}
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
......@@ -4738,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
int srcu_idx;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_mq_freeze_queue_wait(ns->queue);
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);
void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
int srcu_idx;
set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_freeze_queue_start(ns->queue);
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
......@@ -4797,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
int srcu_idx;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
blk_sync_queue(ns->queue);
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
......
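
The core.c conversion follows the usual SRCU-protected list pattern: readers walk ctrl->namespaces under srcu_read_lock() and may sleep, while writers serialize on namespaces_lock, use the _rcu list helpers, and wait for a grace period with synchronize_srcu() before the entry can go away. A minimal sketch of both sides, reusing the field names from the diff:

/* Reader side: may sleep, hence SRCU rather than plain RCU. */
static void for_each_ns_sketch(struct nvme_ctrl *ctrl)
{
        struct nvme_ns *ns;
        int srcu_idx;

        srcu_idx = srcu_read_lock(&ctrl->srcu);
        list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
                blk_mq_freeze_queue_wait(ns->queue);    /* any sleeping work */
        srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

/* Writer side: writers exclude each other via namespaces_lock, then
 * wait out a grace period before the namespace may be torn down. */
static void remove_ns_sketch(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
{
        mutex_lock(&ctrl->namespaces_lock);
        list_del_rcu(&ns->list);
        mutex_unlock(&ctrl->namespaces_lock);
        synchronize_srcu(&ctrl->srcu);          /* readers are done with ns */
}
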
......@@ -789,15 +789,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
bool open_for_write)
{
struct nvme_ns *ns;
int ret;
int ret, srcu_idx;
down_read(&ctrl->namespaces_rwsem);
srcu_idx = srcu_read_lock(&ctrl->srcu);
if (list_empty(&ctrl->namespaces)) {
ret = -ENOTTY;
goto out_unlock;
}
ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
dev_warn(ctrl->device,
"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
......@@ -807,15 +807,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
dev_warn(ctrl->device,
"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
kref_get(&ns->kref);
up_read(&ctrl->namespaces_rwsem);
if (!nvme_get_ns(ns)) {
ret = -ENXIO;
goto out_unlock;
}
srcu_read_unlock(&ctrl->srcu, srcu_idx);
ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
nvme_put_ns(ns);
return ret;
out_unlock:
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
return ret;
}
......
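
The switch from kref_get() to nvme_get_ns() in the ioctl path is the standard lookup-under-(S)RCU idiom: an object found on the list may already have dropped its last reference, so the reader must take the reference with kref_get_unless_zero() and treat failure as "gone". A generic kernel-style sketch of that idiom; struct obj and find_obj() are illustrative, not NVMe code:

struct obj {
        struct kref ref;
        struct list_head list;
};

/* Caller holds srcu_read_lock() (or rcu_read_lock()) on the list. */
static struct obj *find_obj(struct list_head *head)
{
        struct obj *o;

        list_for_each_entry_rcu(o, head, list) {
                if (kref_get_unless_zero(&o->ref))
                        return o;       /* safe: we now own a reference */
        }
        return NULL;                    /* entries seen were already dying */
}
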
......@@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req)
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
blk_mq_end_request(req, 0);
nvme_req(req)->status = 0;
nvme_end_req(req);
kblockd_schedule_work(&ns->head->requeue_work);
}
......@@ -150,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
int srcu_idx;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
if (!ns->head->disk)
continue;
kblockd_schedule_work(&ns->head->requeue_work);
if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
disk_uevent(ns->head->disk, KOBJ_CHANGE);
}
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
static const char *nvme_ana_state_names[] = {
......@@ -193,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
int srcu_idx;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
nvme_mpath_clear_current_path(ns);
kblockd_schedule_work(&ns->head->requeue_work);
}
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
......@@ -595,7 +598,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
int node, srcu_idx;
srcu_idx = srcu_read_lock(&head->srcu);
for_each_node(node)
for_each_online_node(node)
__nvme_find_path(head, node);
srcu_read_unlock(&head->srcu, srcu_idx);
}
......@@ -680,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
unsigned *nr_change_groups = data;
struct nvme_ns *ns;
int srcu_idx;
dev_dbg(ctrl->device, "ANA group %d: %s.\n",
le32_to_cpu(desc->grpid),
......@@ -691,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (!nr_nsids)
return 0;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
unsigned nsid;
again:
nsid = le32_to_cpu(desc->nsids[n]);
......@@ -705,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (ns->head->ns_id > nsid)
goto again;
}
up_read(&ctrl->namespaces_rwsem);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
return 0;
}
......
......@@ -282,7 +282,8 @@ struct nvme_ctrl {
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
struct rw_semaphore namespaces_rwsem;
struct mutex namespaces_lock;
struct srcu_struct srcu;
struct device ctrl_device;
struct device *device; /* char device */
#ifdef CONFIG_NVME_HWMON
......@@ -471,8 +472,6 @@ struct nvme_ns_head {
u8 pi_type;
u8 pi_offset;
u8 guard_type;
u16 sgs;
u32 sws;
#ifdef CONFIG_BLK_DEV_ZONED
u64 zsze;
#endif
......@@ -767,6 +766,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
}
}
void nvme_end_req(struct request *req);
void nvme_complete_rq(struct request *req);
void nvme_complete_batch_req(struct request *req);
......@@ -1161,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
struct nvme_command *cmd, int status);
struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
bool nvme_get_ns(struct nvme_ns *ns);
void nvme_put_ns(struct nvme_ns *ns);
static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
......
......@@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
......
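
The pci.c change matters once large folios are in play: bv.bv_offset can then exceed NVME_CTRL_PAGE_SIZE (fixed at 4096 bytes in the driver), so the old offset + length test rejected small I/Os that do fit in two PRP entries. A stand-alone comparison of the two checks:

#include <stdbool.h>
#include <stdio.h>

#define NVME_CTRL_PAGE_SIZE 4096u

/* Old vs. new eligibility test for the two-PRP fast path; only the
 * offset within the controller page matters for PRP construction. */
static bool old_check(unsigned int bv_offset, unsigned int bv_len)
{
        return bv_offset + bv_len <= NVME_CTRL_PAGE_SIZE * 2;
}

static bool new_check(unsigned int bv_offset, unsigned int bv_len)
{
        return (bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + bv_len <=
               NVME_CTRL_PAGE_SIZE * 2;
}

int main(void)
{
        /* A 4 KiB write at offset 64 KiB inside a large folio: same data,
         * but the old check falls back to the slower path. */
        printf("old=%d new=%d\n", old_check(65536, 4096), new_check(65536, 4096));
        return 0;
}
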
......@@ -676,10 +676,18 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item,
if (kstrtobool(page, &enable))
return -EINVAL;
/*
* take a global nvmet_config_sem because the disable routine has a
* window where it releases the subsys-lock, giving a chance to
* a parallel enable to concurrently execute causing the disable to
* have a misaccounting of the ns percpu_ref.
*/
down_write(&nvmet_config_sem);
if (enable)
ret = nvmet_ns_enable(ns);
else
nvmet_ns_disable(ns);
up_write(&nvmet_config_sem);
return ret ? ret : count;
}
......
......@@ -818,6 +818,15 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
percpu_ref_exit(&sq->ref);
nvmet_auth_sq_free(sq);
/*
* we must reference the ctrl again after waiting for inflight IO
* to complete. Because admin connect may have sneaked in after we
* store sq->ctrl locally, but before we killed the percpu_ref. the
* admin connect allocates and assigns sq->ctrl, which now needs a
* final ref put, as this ctrl is going away.
*/
ctrl = sq->ctrl;
if (ctrl) {
/*
* The teardown flow may take some time, and the host may not
......
......@@ -3700,8 +3700,10 @@ static int sd_revalidate_disk(struct gendisk *disk)
*/
if (sdkp->first_scan ||
q->limits.max_sectors > q->limits.max_dev_sectors ||
q->limits.max_sectors > q->limits.max_hw_sectors)
q->limits.max_sectors > q->limits.max_hw_sectors) {
q->limits.max_sectors = rw_max;
q->limits.max_user_sectors = rw_max;
}
sdkp->first_scan = 0;
......
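
The sd.c hunk pairs with the "block: stack max_user_sectors" change above: on every limits update the block layer re-derives max_sectors from max_hw_sectors and either max_user_sectors or a default cap, so raising only max_sectors would be clamped back on the next revalidation. A stand-alone sketch of that derivation; the 2560-sector default mirrors what BLK_DEF_MAX_SECTORS_CAP is assumed to be:

#include <stdio.h>

#define DEF_MAX_SECTORS_CAP 2560u       /* assumed value of BLK_DEF_MAX_SECTORS_CAP */

static unsigned int effective_max_sectors(unsigned int max_hw_sectors,
                                          unsigned int max_user_sectors)
{
        unsigned int cap = max_user_sectors ? max_user_sectors
                                            : DEF_MAX_SECTORS_CAP;

        return max_hw_sectors < cap ? max_hw_sectors : cap;
}

int main(void)
{
        /* Hardware allows 8192 sectors; without max_user_sectors the next
         * limits update clamps I/O back to the default cap. */
        printf("%u\n", effective_max_sectors(8192, 0));    /* 2560 */
        printf("%u\n", effective_max_sectors(8192, 8192)); /* 8192: sd now sets both */
        return 0;
}
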
......@@ -66,12 +66,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q)
return q->integrity.profile;
}
static inline void blk_queue_max_integrity_segments(struct request_queue *q,
unsigned int segs)
{
q->limits.max_integrity_segments = segs;
}
static inline unsigned short
queue_max_integrity_segments(const struct request_queue *q)
{
......@@ -151,10 +145,6 @@ static inline void blk_integrity_register(struct gendisk *d,
static inline void blk_integrity_unregister(struct gendisk *d)
{
}
static inline void blk_queue_max_integrity_segments(struct request_queue *q,
unsigned int segs)
{
}
static inline unsigned short
queue_max_integrity_segments(const struct request_queue *q)
{
......
......@@ -186,6 +186,7 @@ struct gendisk {
*/
unsigned int nr_zones;
unsigned int zone_capacity;
unsigned int last_zone_capacity;
unsigned long *conv_zones_bitmap;
unsigned int zone_wplugs_hash_bits;
spinlock_t zone_wplugs_lock;
......