lab.nexedi.com will be down from Thursday, 20 March 2025, 07:30:00 UTC for a duration of approximately 2 hours

Commit 33c8846c authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.16/block-2021-10-29' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:

 - mq-deadline accounting improvements (Bart)

 - blk-wbt timer fix (Andrea)

 - Untangle the block layer includes (Christoph)

 - Rework the poll support to be bio based, which will enable adding
   support for polling for bio based drivers (Christoph)

 - Block layer core support for multi-actuator drives (Damien)

 - blk-crypto improvements (Eric)

 - Batched tag allocation support (me)

 - Request completion batching support (me)

 - Plugging improvements (me)

 - Shared tag set improvements (John)

 - Concurrent queue quiesce support (Ming)

 - Cache bdev in ->private_data for block devices (Pavel)

 - bdev dio improvements (Pavel)

 - Block device invalidation and block size improvements (Xie)

 - Various cleanups, fixes, and improvements (Christoph, Jackie,
   Masahira, Tejun, Yu, Pavel, Zheng, me)

* tag 'for-5.16/block-2021-10-29' of git://git.kernel.dk/linux-block: (174 commits)
  blk-mq-debugfs: Show active requests per queue for shared tags
  block: improve readability of blk_mq_end_request_batch()
  virtio-blk: Use blk_validate_block_size() to validate block size
  loop: Use blk_validate_block_size() to validate block size
  nbd: Use blk_validate_block_size() to validate block size
  block: Add a helper to validate the block size
  block: re-flow blk_mq_rq_ctx_init()
  block: prefetch request to be initialized
  block: pass in blk_mq_tags to blk_mq_rq_ctx_init()
  block: add rq_flags to struct blk_mq_alloc_data
  block: add async version of bio_set_polled
  block: kill DIO_MULTI_BIO
  block: kill unused polling bits in __blkdev_direct_IO()
  block: avoid extra iter advance with async iocb
  block: Add independent access ranges support
  blk-mq: don't issue request directly in case that current is to be blocked
  sbitmap: silence data race warning
  blk-cgroup: synchronize blkg creation against policy deactivation
  block: refactor bio_iov_bvec_set()
  block: add single bio async direct IO helper
  ...
parents 9ac21142 9b84c629
This diff is collapsed.
......@@ -1115,7 +1115,8 @@ export MODORDER := $(extmod_prefix)modules.order
export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
core-$(CONFIG_BLOCK) += block/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
......
......@@ -58,7 +58,7 @@ struct nfhd_device {
struct gendisk *disk;
};
static blk_qc_t nfhd_submit_bio(struct bio *bio)
static void nfhd_submit_bio(struct bio *bio)
{
struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data;
struct bio_vec bvec;
......@@ -76,7 +76,6 @@ static blk_qc_t nfhd_submit_bio(struct bio *bio)
sec += len;
}
bio_endio(bio);
return BLK_QC_T_NONE;
}
static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
......
......@@ -16,7 +16,6 @@
#include <linux/console.h>
#include <linux/memblock.h>
#include <linux/ioport.h>
#include <linux/blkdev.h>
#include <asm/bootinfo.h>
#include <asm/mach-rc32434/ddr.h>
......
......@@ -7,7 +7,6 @@
#include <linux/kernel.h>
#include <linux/linkage.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/memblock.h>
#include <linux/pm.h>
#include <linux/smp.h>
......
......@@ -11,7 +11,6 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/screen_info.h>
......
......@@ -25,7 +25,6 @@
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/blkdev.h> /* for initrd_* */
#include <linux/pagemap.h>
#include <asm/pgalloc.h>
......
......@@ -21,6 +21,7 @@
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <asm/prom.h>
......
......@@ -27,6 +27,7 @@
#include <linux/blk-mq.h>
#include <linux/ata.h>
#include <linux/hdreg.h>
#include <linux/major.h>
#include <linux/cdrom.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
......
......@@ -100,7 +100,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
spin_unlock(&dev->lock);
}
static blk_qc_t simdisk_submit_bio(struct bio *bio)
static void simdisk_submit_bio(struct bio *bio)
{
struct simdisk *dev = bio->bi_bdev->bd_disk->private_data;
struct bio_vec bvec;
......@@ -118,7 +118,6 @@ static blk_qc_t simdisk_submit_bio(struct bio *bio)
}
bio_endio(bio);
return BLK_QC_T_NONE;
}
static int simdisk_open(struct block_device *bdev, fmode_t mode)
......
......@@ -73,7 +73,7 @@ config BLK_DEV_ZONED
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
depends on BLK_CGROUP
select BLK_CGROUP_RWSTAT
help
Block layer bio throttling support. It can be used to limit
......@@ -112,7 +112,7 @@ config BLK_WBT_MQ
config BLK_CGROUP_IOLATENCY
bool "Enable support for latency based cgroup IO protection"
depends on BLK_CGROUP=y
depends on BLK_CGROUP
help
Enabling this option enables the .latency interface for IO throttling.
The IO controller will attempt to maintain average IO latencies below
......@@ -132,7 +132,7 @@ config BLK_CGROUP_FC_APPID
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
depends on BLK_CGROUP=y
depends on BLK_CGROUP
select BLK_RQ_IO_DATA_LEN
select BLK_RQ_ALLOC_TIME
help
......@@ -190,39 +190,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
by falling back to the kernel crypto API when inline
encryption hardware is not present.
menu "Partition Types"
source "block/partitions/Kconfig"
endmenu
endif # BLOCK
config BLOCK_COMPAT
bool
depends on BLOCK && COMPAT
default y
def_bool COMPAT
config BLK_MQ_PCI
bool
depends on BLOCK && PCI
default y
def_bool PCI
config BLK_MQ_VIRTIO
bool
depends on BLOCK && VIRTIO
depends on VIRTIO
default y
config BLK_MQ_RDMA
bool
depends on BLOCK && INFINIBAND
depends on INFINIBAND
default y
config BLK_PM
def_bool BLOCK && PM
def_bool PM
# do not use in new code
config BLOCK_HOLDER_DEPRECATED
bool
source "block/Kconfig.iosched"
endif # BLOCK
# SPDX-License-Identifier: GPL-2.0
if BLOCK
menu "IO Schedulers"
config MQ_IOSCHED_DEADLINE
......@@ -45,5 +43,3 @@ config BFQ_CGROUP_DEBUG
files in a cgroup which can be useful for debugging.
endmenu
endif
......@@ -3,13 +3,13 @@
# Makefile for the kernel block layer
#
obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o
disk-events.o blk-ia-ranges.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
......@@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
......@@ -12,6 +12,7 @@
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
......@@ -326,12 +327,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
result = blk_queue_enter(bdev->bd_disk->queue, 0);
result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ);
blk_queue_exit(bdev->bd_disk->queue);
blk_queue_exit(bdev_get_queue(bdev));
return result;
}
......@@ -362,7 +363,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
result = blk_queue_enter(bdev->bd_disk->queue, 0);
result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result)
return result;
......@@ -375,7 +376,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
clean_page_buffers(page);
unlock_page(page);
}
blk_queue_exit(bdev->bd_disk->queue);
blk_queue_exit(bdev_get_queue(bdev));
return result;
}
......@@ -492,6 +493,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
spin_lock_init(&bdev->bd_size_lock);
bdev->bd_partno = partno;
bdev->bd_inode = inode;
bdev->bd_queue = disk->queue;
bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) {
iput(inode);
......@@ -962,9 +964,11 @@ EXPORT_SYMBOL(blkdev_put);
* @pathname: special file representing the block device
* @dev: return value of the block device's dev_t
*
* Get a reference to the blockdevice at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error)
* otherwise.
* Lookup the block device's dev_t at @pathname in the current
* namespace if possible and return it by @dev.
*
* RETURNS:
* 0 if succeeded, errno otherwise.
*/
int lookup_bdev(const char *pathname, dev_t *dev)
{
......
......@@ -6,13 +6,13 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>
#include "elevator.h"
#include "bfq-iosched.h"
#ifdef CONFIG_BFQ_CGROUP_DEBUG
......@@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
if (blkg_rwstat_init(&stats->bytes, gfp) ||
blkg_rwstat_init(&stats->ios, gfp))
return -ENOMEM;
goto error;
#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
......@@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
bfq_stat_init(&stats->dequeue, gfp) ||
bfq_stat_init(&stats->group_wait_time, gfp) ||
bfq_stat_init(&stats->idle_time, gfp) ||
bfq_stat_init(&stats->empty_time, gfp)) {
bfqg_stats_exit(stats);
return -ENOMEM;
}
bfq_stat_init(&stats->empty_time, gfp))
goto error;
#endif
return 0;
error:
bfqg_stats_exit(stats);
return -ENOMEM;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
......
......@@ -117,7 +117,6 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
......@@ -127,6 +126,7 @@
#include <trace/events/block.h>
#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
......@@ -6884,8 +6884,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int min_shallow;
min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
}
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
......
......@@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/mempool.h>
#include <linux/export.h>
#include <linux/bio.h>
......@@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt &&
bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue,
bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
&bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0;
......
......@@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size)
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
bslab->slab = kmem_cache_create(bslab->name, size,
ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
ARCH_KMALLOC_MINALIGN,
SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
if (!bslab->slab)
goto fail_alloc_slab;
......@@ -156,7 +157,7 @@ static void bio_put_slab(struct bio_set *bs)
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);
BUG_ON(nr_vecs > BIO_MAX_VECS);
if (nr_vecs == BIO_MAX_VECS)
mempool_free(bv, pool);
......@@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
bio->bi_cookie = BLK_QC_T_NONE;
bio->bi_max_vecs = max_vecs;
bio->bi_io_vec = table;
......@@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio);
* REQ_OP_READ, zero the truncated part. This function should only
* be used for handling corner cases, such as bio eod.
*/
void bio_truncate(struct bio *bio, unsigned new_size)
static void bio_truncate(struct bio *bio, unsigned new_size)
{
struct bio_vec bv;
struct bvec_iter iter;
......@@ -677,7 +679,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
void bio_put(struct bio *bio)
{
if (unlikely(bio_flagged(bio, BIO_REFFED))) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
BUG_ON(!atomic_read(&bio->__bi_cnt));
if (!atomic_dec_and_test(&bio->__bi_cnt))
return;
}
......@@ -772,6 +774,23 @@ const char *bio_devname(struct bio *bio, char *buf)
}
EXPORT_SYMBOL(bio_devname);
/**
* bio_full - check if the bio is full
* @bio: bio to check
* @len: length of one segment to be added
*
* Return true if @bio is full and one segment with @len bytes can't be
* added to the bio, otherwise return false
*/
static inline bool bio_full(struct bio *bio, unsigned len)
{
if (bio->bi_vcnt >= bio->bi_max_vecs)
return true;
if (bio->bi_iter.bi_size > UINT_MAX - len)
return true;
return false;
}
static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off,
bool *same_page)
......@@ -791,6 +810,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
}
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
static bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
/*
* Try to merge a page into a segment, while obeying the hardware segment
* size limit. This is not for normal read/write bios, but for passthrough
......@@ -908,7 +965,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
int bio_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
bool same_page = false;
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
......@@ -922,45 +979,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
EXPORT_SYMBOL_GPL(__bio_try_merge_page);
/**
* __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio
......@@ -1015,52 +1033,40 @@ int bio_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_add_page);
void bio_release_pages(struct bio *bio, bool mark_dirty)
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct bvec_iter_all iter_all;
struct bio_vec *bvec;
if (bio_flagged(bio, BIO_NO_PAGE_REF))
return;
bio_for_each_segment_all(bvec, bio, iter_all) {
if (mark_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
}
}
EXPORT_SYMBOL_GPL(bio_release_pages);
EXPORT_SYMBOL_GPL(__bio_release_pages);
static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
size_t size = iov_iter_count(iter);
WARN_ON_ONCE(bio->bi_max_vecs);
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
size_t max_sectors = queue_max_zone_append_sectors(q);
size = min(size, max_sectors << SECTOR_SHIFT);
}
bio->bi_vcnt = iter->nr_segs;
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset;
bio->bi_iter.bi_size = iter->count;
bio->bi_iter.bi_size = size;
bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_CLONED);
}
static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
__bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, iter->count);
return 0;
}
static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct iov_iter i = *iter;
iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
__bio_iov_bvec_set(bio, &i);
iov_iter_advance(iter, i.count);
return 0;
}
static void bio_put_pages(struct page **pages, size_t size, size_t off)
{
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
......@@ -1130,7 +1136,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
......@@ -1202,9 +1208,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
int ret = 0;
if (iov_iter_is_bvec(iter)) {
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
return bio_iov_bvec_set_append(bio, iter);
return bio_iov_bvec_set(bio, iter);
bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, bio->bi_iter.bi_size);
return 0;
}
do {
......@@ -1260,18 +1266,7 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
/**
* bio_advance - increment/complete a bio by some number of bytes
* @bio: bio to advance
* @bytes: number of bytes to complete
*
* This updates bi_sector, bi_size and bi_idx; if the number of bytes to
* complete doesn't align with a bvec boundary, then bv_len and bv_offset will
* be updated on the last bvec as well.
*
* @bio will then represent the remaining, uncompleted portion of the io.
*/
void bio_advance(struct bio *bio, unsigned bytes)
void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
bio_integrity_advance(bio, bytes);
......@@ -1279,7 +1274,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(bio_advance);
EXPORT_SYMBOL(__bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
......@@ -1467,10 +1462,10 @@ void bio_endio(struct bio *bio)
return;
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
}
......
......@@ -32,6 +32,7 @@
#include <linux/psi.h>
#include "blk.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
......@@ -620,7 +621,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp)
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
char *input, struct blkg_conf_ctx *ctx)
__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
__acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
{
struct block_device *bdev;
struct request_queue *q;
......@@ -631,7 +632,15 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
q = bdev->bd_disk->queue;
q = bdev_get_queue(bdev);
/*
* blkcg_deactivate_policy() requires queue to be frozen, we can grab
* q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
*/
ret = blk_queue_enter(q, 0);
if (ret)
return ret;
rcu_read_lock();
spin_lock_irq(&q->queue_lock);
......@@ -702,6 +711,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
goto success;
}
success:
blk_queue_exit(q);
ctx->bdev = bdev;
ctx->blkg = blkg;
ctx->body = input;
......@@ -714,6 +724,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
rcu_read_unlock();
fail:
blkdev_put_no_open(bdev);
blk_queue_exit(q);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
......@@ -736,9 +747,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
* with blkg_conf_prep().
*/
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
__releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
{
spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
rcu_read_unlock();
blkdev_put_no_open(ctx->bdev);
}
......@@ -841,7 +852,7 @@ static void blkcg_fill_root_iostats(void)
while ((dev = class_dev_iter_next(&iter))) {
struct block_device *bdev = dev_to_bdev(dev);
struct blkcg_gq *blkg =
blk_queue_root_blkg(bdev->bd_disk->queue);
blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp;
int cpu;
......@@ -1800,7 +1811,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
rcu_read_lock();
blkg = blkg_lookup_create(css_to_blkcg(css),
bio->bi_bdev->bd_disk->queue);
bdev_get_queue(bio->bi_bdev));
while (blkg) {
if (blkg_tryget(blkg)) {
ret_blkg = blkg;
......@@ -1836,8 +1847,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
if (css && css->parent) {
bio->bi_blkg = blkg_tryget_closest(bio, css);
} else {
blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
......
This diff is collapsed.
This diff is collapsed.
......@@ -7,7 +7,7 @@
#define __LINUX_BLK_CRYPTO_INTERNAL_H
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
/* Represents a crypto mode supported by blk-crypto */
struct blk_crypto_mode {
......
This diff is collapsed.
......@@ -11,7 +11,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/keyslot-manager.h>
#include <linux/blk-crypto-profile.h>
#include <linux/module.h>
#include <linux/slab.h>
......@@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio)
blk_status_t __blk_crypto_init_request(struct request *rq)
{
return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
return blk_crypto_get_keyslot(rq->q->crypto_profile,
rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
}
/**
......@@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq)
*/
void __blk_crypto_free_request(struct request *rq)
{
blk_ksm_put_slot(rq->crypt_keyslot);
blk_crypto_put_keyslot(rq->crypt_keyslot);
mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
blk_crypto_rq_set_defaults(rq);
}
......@@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
struct blk_crypto_profile *profile;
/* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) {
......@@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm,
&bc_key->crypto_cfg))
profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
return true;
if (blk_crypto_fallback_bio_prep(bio_ptr))
......@@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
blk_ksm_crypto_cfg_supported(q->ksm, cfg);
__blk_crypto_cfg_supported(q->crypto_profile, cfg);
}
/**
......@@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
int blk_crypto_start_using_key(const struct blk_crypto_key *key,
struct request_queue *q)
{
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
......@@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
* evicted from any hardware that it might have been programmed into. The key
* must not be in use by any in-flight IO when this function is called.
*
* Return: 0 on success or if key is not present in the q's ksm, -err on error.
* Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/
int blk_crypto_evict_key(struct request_queue *q,
const struct blk_crypto_key *key)
{
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
return blk_ksm_evict_key(q->ksm, key);
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return __blk_crypto_evict_key(q->crypto_profile, key);
/*
* If the request queue's associated inline encryption hardware didn't
* have support for the key, then the key might have been programmed
* into the fallback keyslot manager, so try to evict from there.
* If the request_queue didn't support the key, then blk-crypto-fallback
* may have been used, so try to evict the key from blk-crypto-fallback.
*/
return blk_crypto_fallback_evict_key(key);
}
......
......@@ -65,13 +65,19 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
static bool blk_rq_is_poll(struct request *rq)
{
return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL;
if (!rq->mq_hctx)
return false;
if (rq->mq_hctx->type != HCTX_TYPE_POLL)
return false;
if (WARN_ON_ONCE(!rq->bio))
return false;
return true;
}
static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
do {
blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), true);
bio_poll(rq->bio, NULL, 0);
cond_resched();
} while (!completion_done(wait));
}
......
......@@ -379,7 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*/
void blk_insert_flush(struct request *rq)
bool blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags; /* may change, cache */
......@@ -409,7 +409,7 @@ void blk_insert_flush(struct request *rq)
*/
if (!policy) {
blk_mq_end_request(rq, 0);
return;
return true;
}
BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
......@@ -420,10 +420,8 @@ void blk_insert_flush(struct request *rq)
* for normal execution.
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
blk_mq_request_bypass_insert(rq, false, false);
return;
}
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))
return false;
/*
* @rq should go through flush machinery. Mark it part of flush
......@@ -439,6 +437,8 @@ void blk_insert_flush(struct request *rq)
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
return true;
}
/**
......
// SPDX-License-Identifier: GPL-2.0
/*
* Block device concurrent positioning ranges.
*
* Copyright (C) 2021 Western Digital Corporation or its Affiliates.
*/
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/init.h>
#include "blk.h"
static ssize_t
blk_ia_range_sector_show(struct blk_independent_access_range *iar,
char *buf)
{
return sprintf(buf, "%llu\n", iar->sector);
}
static ssize_t
blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar,
char *buf)
{
return sprintf(buf, "%llu\n", iar->nr_sectors);
}
struct blk_ia_range_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_independent_access_range *iar, char *buf);
};
static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
.attr = { .name = "sector", .mode = 0444 },
.show = blk_ia_range_sector_show,
};
static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
.attr = { .name = "nr_sectors", .mode = 0444 },
.show = blk_ia_range_nr_sectors_show,
};
static struct attribute *blk_ia_range_attrs[] = {
&blk_ia_range_sector_entry.attr,
&blk_ia_range_nr_sectors_entry.attr,
NULL,
};
ATTRIBUTE_GROUPS(blk_ia_range);
static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct blk_ia_range_sysfs_entry *entry =
container_of(attr, struct blk_ia_range_sysfs_entry, attr);
struct blk_independent_access_range *iar =
container_of(kobj, struct blk_independent_access_range, kobj);
ssize_t ret;
mutex_lock(&iar->queue->sysfs_lock);
ret = entry->show(iar, buf);
mutex_unlock(&iar->queue->sysfs_lock);
return ret;
}
static const struct sysfs_ops blk_ia_range_sysfs_ops = {
.show = blk_ia_range_sysfs_show,
};
/*
* Independent access range entries are not freed individually, but alltogether
* with struct blk_independent_access_ranges and its array of ranges. Since
* kobject_add() takes a reference on the parent kobject contained in
* struct blk_independent_access_ranges, the array of independent access range
* entries cannot be freed until kobject_del() is called for all entries.
* So we do not need to do anything here, but still need this no-op release
* operation to avoid complaints from the kobject code.
*/
static void blk_ia_range_sysfs_nop_release(struct kobject *kobj)
{
}
static struct kobj_type blk_ia_range_ktype = {
.sysfs_ops = &blk_ia_range_sysfs_ops,
.default_groups = blk_ia_range_groups,
.release = blk_ia_range_sysfs_nop_release,
};
/*
* This will be executed only after all independent access range entries are
* removed with kobject_del(), at which point, it is safe to free everything,
* including the array of ranges.
*/
static void blk_ia_ranges_sysfs_release(struct kobject *kobj)
{
struct blk_independent_access_ranges *iars =
container_of(kobj, struct blk_independent_access_ranges, kobj);
kfree(iars);
}
static struct kobj_type blk_ia_ranges_ktype = {
.release = blk_ia_ranges_sysfs_release,
};
/**
* disk_register_ia_ranges - register with sysfs a set of independent
* access ranges
* @disk: Target disk
* @new_iars: New set of independent access ranges
*
* Register with sysfs a set of independent access ranges for @disk.
* If @new_iars is not NULL, this set of ranges is registered and the old set
* specified by q->ia_ranges is unregistered. Otherwise, q->ia_ranges is
* registered if it is not already.
*/
int disk_register_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *new_iars)
{
struct request_queue *q = disk->queue;
struct blk_independent_access_ranges *iars;
int i, ret;
lockdep_assert_held(&q->sysfs_dir_lock);
lockdep_assert_held(&q->sysfs_lock);
/* If a new range set is specified, unregister the old one */
if (new_iars) {
if (q->ia_ranges)
disk_unregister_independent_access_ranges(disk);
q->ia_ranges = new_iars;
}
iars = q->ia_ranges;
if (!iars)
return 0;
/*
* At this point, iars is the new set of sector access ranges that needs
* to be registered with sysfs.
*/
WARN_ON(iars->sysfs_registered);
ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
&q->kobj, "%s", "independent_access_ranges");
if (ret) {
q->ia_ranges = NULL;
kfree(iars);
return ret;
}
for (i = 0; i < iars->nr_ia_ranges; i++) {
iars->ia_range[i].queue = q;
ret = kobject_init_and_add(&iars->ia_range[i].kobj,
&blk_ia_range_ktype, &iars->kobj,
"%d", i);
if (ret) {
while (--i >= 0)
kobject_del(&iars->ia_range[i].kobj);
kobject_del(&iars->kobj);
kobject_put(&iars->kobj);
return ret;
}
}
iars->sysfs_registered = true;
return 0;
}
void disk_unregister_independent_access_ranges(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blk_independent_access_ranges *iars = q->ia_ranges;
int i;
lockdep_assert_held(&q->sysfs_dir_lock);
lockdep_assert_held(&q->sysfs_lock);
if (!iars)
return;
if (iars->sysfs_registered) {
for (i = 0; i < iars->nr_ia_ranges; i++)
kobject_del(&iars->ia_range[i].kobj);
kobject_del(&iars->kobj);
kobject_put(&iars->kobj);
} else {
kfree(iars);
}
q->ia_ranges = NULL;
}
static struct blk_independent_access_range *
disk_find_ia_range(struct blk_independent_access_ranges *iars,
sector_t sector)
{
struct blk_independent_access_range *iar;
int i;
for (i = 0; i < iars->nr_ia_ranges; i++) {
iar = &iars->ia_range[i];
if (sector >= iar->sector &&
sector < iar->sector + iar->nr_sectors)
return iar;
}
return NULL;
}
static bool disk_check_ia_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *iars)
{
struct blk_independent_access_range *iar, *tmp;
sector_t capacity = get_capacity(disk);
sector_t sector = 0;
int i;
/*
* While sorting the ranges in increasing LBA order, check that the
* ranges do not overlap, that there are no sector holes and that all
* sectors belong to one range.
*/
for (i = 0; i < iars->nr_ia_ranges; i++) {
tmp = disk_find_ia_range(iars, sector);
if (!tmp || tmp->sector != sector) {
pr_warn("Invalid non-contiguous independent access ranges\n");
return false;
}
iar = &iars->ia_range[i];
if (tmp != iar) {
swap(iar->sector, tmp->sector);
swap(iar->nr_sectors, tmp->nr_sectors);
}
sector += iar->nr_sectors;
}
if (sector != capacity) {
pr_warn("Independent access ranges do not match disk capacity\n");
return false;
}
return true;
}
static bool disk_ia_ranges_changed(struct gendisk *disk,
struct blk_independent_access_ranges *new)
{
struct blk_independent_access_ranges *old = disk->queue->ia_ranges;
int i;
if (!old)
return true;
if (old->nr_ia_ranges != new->nr_ia_ranges)
return true;
for (i = 0; i < old->nr_ia_ranges; i++) {
if (new->ia_range[i].sector != old->ia_range[i].sector ||
new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors)
return true;
}
return false;
}
/**
* disk_alloc_independent_access_ranges - Allocate an independent access ranges
* data structure
* @disk: target disk
* @nr_ia_ranges: Number of independent access ranges
*
* Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges
* access range descriptors.
*/
struct blk_independent_access_ranges *
disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges)
{
struct blk_independent_access_ranges *iars;
iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges),
GFP_KERNEL, disk->queue->node);
if (iars)
iars->nr_ia_ranges = nr_ia_ranges;
return iars;
}
EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges);
/**
* disk_set_independent_access_ranges - Set a disk independent access ranges
* @disk: target disk
* @iars: independent access ranges structure
*
* Set the independent access ranges information of the request queue
* of @disk to @iars. If @iars is NULL and the independent access ranges
* structure already set is cleared. If there are no differences between
* @iars and the independent access ranges structure already set, @iars
* is freed.
*/
void disk_set_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *iars)
{
struct request_queue *q = disk->queue;
if (WARN_ON_ONCE(iars && !iars->nr_ia_ranges)) {
kfree(iars);
iars = NULL;
}
mutex_lock(&q->sysfs_dir_lock);
mutex_lock(&q->sysfs_lock);
if (iars) {
if (!disk_check_ia_ranges(disk, iars)) {
kfree(iars);
iars = NULL;
goto reg;
}
if (!disk_ia_ranges_changed(disk, iars)) {
kfree(iars);
goto unlock;
}
}
/*
* This may be called for a registered queue. E.g. during a device
* revalidation. If that is the case, we need to unregister the old
* set of independent access ranges and register the new set. If the
* queue is not registered, registration of the device request queue
* will register the independent access ranges, so only swap in the
* new set and free the old one.
*/
reg:
if (blk_queue_registered(q)) {
disk_register_independent_access_ranges(disk, iars);
} else {
swap(q->ia_ranges, iars);
kfree(iars);
}
unlock:
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
}
EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges);
......@@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
......@@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->ksm) {
if (disk->queue->crypto_profile) {
pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
blk_ksm_unregister(disk->queue);
blk_crypto_unregister(disk->queue);
}
#endif
}
......
......@@ -3165,12 +3165,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
ioc = q_to_ioc(bdev->bd_disk->queue);
ioc = q_to_ioc(bdev_get_queue(bdev));
if (!ioc) {
ret = blk_iocost_init(bdev->bd_disk->queue);
ret = blk_iocost_init(bdev_get_queue(bdev));
if (ret)
goto err;
ioc = q_to_ioc(bdev->bd_disk->queue);
ioc = q_to_ioc(bdev_get_queue(bdev));
}
spin_lock_irq(&ioc->lock);
......@@ -3332,12 +3332,12 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
ioc = q_to_ioc(bdev->bd_disk->queue);
ioc = q_to_ioc(bdev_get_queue(bdev));
if (!ioc) {
ret = blk_iocost_init(bdev->bd_disk->queue);
ret = blk_iocost_init(bdev_get_queue(bdev));
if (ret)
goto err;
ioc = q_to_ioc(bdev->bd_disk->queue);
ioc = q_to_ioc(bdev_get_queue(bdev));
}
spin_lock_irq(&ioc->lock);
......
......@@ -74,6 +74,7 @@
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk.h"
......
......@@ -6,12 +6,45 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
#include "blk-throttle.h"
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
{
struct bvec_iter iter = bio->bi_iter;
int idx;
bio_get_first_bvec(bio, bv);
if (bv->bv_len == bio->bi_iter.bi_size)
return; /* this bio only has a single bvec */
bio_advance_iter(bio, &iter, iter.bi_size);
if (!iter.bi_bvec_done)
idx = iter.bi_idx - 1;
else /* in the middle of bvec */
idx = iter.bi_idx;
*bv = bio->bi_io_vec[idx];
/*
* iter.bi_bvec_done records actual length of the last bvec
* if this bio ends in the middle of one io vector
*/
if (iter.bi_bvec_done)
bv->bv_len = iter.bi_bvec_done;
}
static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next)
......@@ -285,13 +318,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_hipri(bio);
bio_clear_polled(bio);
return bio_split(bio, sectors, GFP_NOIO, bs);
}
/**
* __blk_queue_split - split a bio and submit the second half
* @q: [in] request_queue new bio is being queued at
* @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio
*
......@@ -302,9 +335,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* of the caller to ensure that q->bio_split is only released after processing
* of the split bio has finished.
*/
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs)
{
struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
struct bio *split = NULL;
switch (bio_op(*bio)) {
......@@ -321,21 +354,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
nr_segs);
break;
default:
/*
* All drivers must accept single-segments bios that are <=
* PAGE_SIZE. This is a quick and dirty check that relies on
* the fact that bi_io_vec[0] is always valid if a bio has data.
* The check might lead to occasional false negatives when bios
* are cloned, but compared to the performance impact of cloned
* bios themselves the loop below doesn't matter anyway.
*/
if (!q->limits.chunk_sectors &&
(*bio)->bi_vcnt == 1 &&
((*bio)->bi_io_vec[0].bv_len +
(*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
*nr_segs = 1;
break;
}
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
break;
}
......@@ -365,9 +383,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
*/
void blk_queue_split(struct bio **bio)
{
struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
unsigned int nr_segs;
__blk_queue_split(bio, &nr_segs);
if (blk_may_split(q, *bio))
__blk_queue_split(q, bio, &nr_segs);
}
EXPORT_SYMBOL(blk_queue_split);
......@@ -558,6 +578,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
return queue_max_segments(rq->q);
}
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
struct request_queue *q = rq->q;
if (blk_rq_is_passthrough(rq))
return q->limits.max_hw_sectors;
if (!q->limits.chunk_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
req_op(rq) == REQ_OP_SECURE_ERASE)
return blk_queue_get_max_sectors(q, req_op(rq));
return min(blk_max_size_offset(q, offset, 0),
blk_queue_get_max_sectors(q, req_op(rq)));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
......@@ -718,6 +755,13 @@ static enum elv_merge blk_try_req_merge(struct request *req,
return ELEVATOR_NO_MERGE;
}
static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
{
if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
return true;
return false;
}
/*
* For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held.
......@@ -1023,12 +1067,11 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
* @same_queue_rq: output value, will be true if there's an existing request
* from the passed in @q already in the plug list
*
* Determine whether @bio being queued on @q can be merged with a request
* on %current's plugged list. Returns %true if merge was successful,
* Determine whether @bio being queued on @q can be merged with the previous
* request on %current's plugged list. Returns %true if merge was successful,
* otherwise %false.
*
* Plugging coalesces IOs from the same issuer for the same purpose without
......@@ -1041,36 +1084,26 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq)
unsigned int nr_segs, bool *same_queue_rq)
{
struct blk_plug *plug;
struct request *rq;
struct list_head *plug_list;
plug = blk_mq_plug(q, bio);
if (!plug)
if (!plug || rq_list_empty(plug->mq_list))
return false;
plug_list = &plug->mq_list;
list_for_each_entry_reverse(rq, plug_list, queuelist) {
if (rq->q == q && same_queue_rq) {
/*
* Only blk-mq multiple hardware queues case checks the
* rq in the same queue, there should be only one such
* rq in a queue
**/
*same_queue_rq = rq;
}
if (rq->q != q)
continue;
if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
BIO_MERGE_OK)
return true;
/* check the previously added entry for a quick merge attempt */
rq = rq_list_peek(&plug->mq_list);
if (rq->q == q) {
/*
* Only blk-mq multiple hardware queues case checks the rq in
* the same queue, there should be only one such rq in a queue
*/
*same_queue_rq = true;
}
if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK)
return true;
return false;
}
......
......@@ -287,7 +287,7 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT),
CMD_FLAG_NAME(NOUNMAP),
CMD_FLAG_NAME(HIPRI),
CMD_FLAG_NAME(POLLED),
};
#undef CMD_FLAG_NAME
......@@ -453,11 +453,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
atomic_read(&tags->active_queues));
seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(tags->bitmap_tags, m);
sbitmap_queue_show(&tags->bitmap_tags, m);
if (tags->nr_reserved_tags) {
seq_puts(m, "\nbreserved_tags:\n");
sbitmap_queue_show(tags->breserved_tags, m);
sbitmap_queue_show(&tags->breserved_tags, m);
}
}
......@@ -488,7 +488,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
if (res)
goto out;
if (hctx->tags)
sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock);
out:
......@@ -522,77 +522,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
if (res)
goto out;
if (hctx->sched_tags)
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock);
out:
return res;
}
static int hctx_io_poll_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "considered=%lu\n", hctx->poll_considered);
seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
seq_printf(m, "success=%lu\n", hctx->poll_success);
return 0;
}
static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
return count;
}
static int hctx_dispatched_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
int i;
seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
unsigned int d = 1U << (i - 1);
seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
}
seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
return 0;
}
static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
int i;
for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
hctx->dispatched[i] = 0;
return count;
}
static int hctx_queued_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "%lu\n", hctx->queued);
return 0;
}
static ssize_t hctx_queued_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
hctx->queued = 0;
return count;
}
static int hctx_run_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
......@@ -614,7 +550,7 @@ static int hctx_active_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
return 0;
}
......@@ -663,57 +599,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
static int ctx_dispatched_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
return 0;
}
static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
return count;
}
static int ctx_merged_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu\n", ctx->rq_merged);
return 0;
}
static ssize_t ctx_merged_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_merged = 0;
return count;
}
static int ctx_completed_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
return 0;
}
static ssize_t ctx_completed_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
return count;
}
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{
const struct blk_mq_debugfs_attr *attr = m->private;
......@@ -789,9 +674,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
{"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
{"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
{"queued", 0600, hctx_queued_show, hctx_queued_write},
{"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
......@@ -803,9 +685,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
{"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
{"merged", 0600, ctx_merged_show, ctx_merged_write},
{"completed", 0600, ctx_completed_show, ctx_completed_write},
{},
};
......
......@@ -57,10 +57,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return;
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
......@@ -363,7 +361,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
}
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct elevator_queue *e = q->elevator;
......@@ -389,13 +387,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
* potentially merge with. Currently includes a hand-wavy stop
* count of 8, to not spend too much time checking for merges.
*/
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
ctx->rq_merged++;
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
ret = true;
}
spin_unlock(&ctx->lock);
return ret;
}
......@@ -515,83 +510,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
percpu_ref_put(&q->q_usage_counter);
}
static int blk_mq_sched_alloc_tags(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
{
struct blk_mq_tag_set *set = q->tag_set;
int ret;
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
hctx->sched_tags = q->sched_shared_tags;
return 0;
}
hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
q->nr_requests);
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
set->reserved_tags, set->flags);
if (!hctx->sched_tags)
return -ENOMEM;
return 0;
}
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
if (ret) {
blk_mq_free_rq_map(hctx->sched_tags, set->flags);
hctx->sched_tags = NULL;
}
return ret;
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
blk_mq_free_rq_map(queue->sched_shared_tags);
queue->sched_shared_tags = NULL;
}
/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q)
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) {
blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
if (!blk_mq_is_shared_tags(flags))
blk_mq_free_rq_map(hctx->sched_tags);
hctx->sched_tags = NULL;
}
}
if (blk_mq_is_shared_tags(flags))
blk_mq_exit_sched_shared_tags(q);
}
static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
struct blk_mq_tag_set *set = queue->tag_set;
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
struct blk_mq_hw_ctx *hctx;
int ret, i;
/*
* Set initial depth at max so that we don't need to reallocate for
* updating nr_requests.
*/
ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
&queue->sched_breserved_tags,
MAX_SCHED_RQ, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
return ret;
queue_for_each_hw_ctx(queue, hctx, i) {
hctx->sched_tags->bitmap_tags =
&queue->sched_bitmap_tags;
hctx->sched_tags->breserved_tags =
&queue->sched_breserved_tags;
}
queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
BLK_MQ_NO_HCTX_IDX,
MAX_SCHED_RQ);
if (!queue->sched_shared_tags)
return -ENOMEM;
sbitmap_queue_resize(&queue->sched_bitmap_tags,
queue->nr_requests - set->reserved_tags);
blk_mq_tag_update_sched_shared_tags(queue);
return 0;
}
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
{
sbitmap_queue_free(&queue->sched_bitmap_tags);
sbitmap_queue_free(&queue->sched_breserved_tags);
}
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
unsigned int i, flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
unsigned int i;
int ret;
if (!e) {
......@@ -606,23 +589,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* Additionally, this is a per-hw queue depth.
*/
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
BLKDEV_MAX_RQ);
BLKDEV_DEFAULT_RQ);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_tags(q, hctx, i);
if (blk_mq_is_shared_tags(flags)) {
ret = blk_mq_init_sched_shared_tags(q);
if (ret)
goto err_free_tags;
return ret;
}
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
ret = blk_mq_init_sched_shared_sbitmap(q);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
if (ret)
goto err_free_tags;
goto err_free_map_and_rqs;
}
ret = e->ops.init_sched(q, e);
if (ret)
goto err_free_sbitmap;
goto err_free_map_and_rqs;
blk_mq_debugfs_register_sched(q);
......@@ -631,7 +614,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
ret = e->ops.init_hctx(hctx, i);
if (ret) {
eq = q->elevator;
blk_mq_sched_free_requests(q);
blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
......@@ -642,12 +625,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return 0;
err_free_sbitmap:
if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
blk_mq_exit_sched_shared_sbitmap(q);
err_free_tags:
blk_mq_sched_free_requests(q);
blk_mq_sched_tags_teardown(q);
err_free_map_and_rqs:
blk_mq_sched_free_rqs(q);
blk_mq_sched_tags_teardown(q, flags);
q->elevator = NULL;
return ret;
}
......@@ -656,14 +637,20 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* called in either blk_queue_cleanup or elevator_switch, tagset
* is required for freeing requests
*/
void blk_mq_sched_free_requests(struct request_queue *q)
void blk_mq_sched_free_rqs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
BLK_MQ_NO_HCTX_IDX);
} else {
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set,
hctx->sched_tags, i);
}
}
}
......@@ -684,8 +671,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
blk_mq_debugfs_unregister_sched(q);
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q);
if (blk_mq_is_sbitmap_shared(flags))
blk_mq_exit_sched_shared_sbitmap(q);
blk_mq_sched_tags_teardown(q, flags);
q->elevator = NULL;
}
......@@ -2,21 +2,22 @@
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H
#include "elevator.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
void blk_mq_sched_assign_ioc(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async);
......@@ -28,45 +29,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_requests(struct request_queue *q);
void blk_mq_sched_free_rqs(struct request_queue *q);
static inline bool
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (blk_queue_nomerges(q) || !bio_mergeable(bio))
return false;
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
__blk_mq_sched_restart(hctx);
}
return __blk_mq_sched_bio_merge(q, bio, nr_segs);
static inline bool bio_mergeable(struct bio *bio)
{
return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
}
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
struct elevator_queue *e = q->elevator;
if (e && e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = q->elevator;
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
}
return true;
}
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
struct elevator_queue *e = rq->q->elevator;
if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = rq->q->elevator;
if (e && e->type->ops.completed_request)
e->type->ops.completed_request(rq, now);
if (e->type->ops.completed_request)
e->type->ops.completed_request(rq, now);
}
}
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
if (rq->rq_flags & RQF_ELV) {
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request)
e->type->ops.requeue_request(rq);
if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
e->type->ops.requeue_request(rq);
}
}
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
......
......@@ -24,13 +24,12 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
!test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
atomic_inc(&set->active_queues_shared_sbitmap);
atomic_inc(&hctx->tags->active_queues);
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
......@@ -45,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
sbitmap_queue_wake_all(tags->bitmap_tags);
sbitmap_queue_wake_all(&tags->bitmap_tags);
if (include_reserve)
sbitmap_queue_wake_all(tags->breserved_tags);
sbitmap_queue_wake_all(&tags->breserved_tags);
}
/*
......@@ -57,20 +56,20 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
&q->queue_flags))
return;
atomic_dec(&set->active_queues_shared_sbitmap);
} else {
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
atomic_dec(&tags->active_queues);
}
atomic_dec(&tags->active_queues);
blk_mq_tag_wakeup_all(tags, false);
}
......@@ -87,6 +86,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
return __sbitmap_queue_get(bt);
}
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
unsigned int *offset)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct sbitmap_queue *bt = &tags->bitmap_tags;
unsigned long ret;
if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED ||
data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
return 0;
ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
*offset += tags->nr_reserved_tags;
return ret;
}
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
......@@ -101,10 +115,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
WARN_ON_ONCE(1);
return BLK_MQ_NO_TAG;
}
bt = tags->breserved_tags;
bt = &tags->breserved_tags;
tag_offset = 0;
} else {
bt = tags->bitmap_tags;
bt = &tags->bitmap_tags;
tag_offset = tags->nr_reserved_tags;
}
......@@ -150,9 +164,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = tags->breserved_tags;
bt = &tags->breserved_tags;
else
bt = tags->bitmap_tags;
bt = &tags->bitmap_tags;
/*
* If destination hw queue is changed, fake wake up on
......@@ -186,13 +200,19 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags);
sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
} else {
BUG_ON(tag >= tags->nr_reserved_tags);
sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
}
}
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
{
sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
tag_array, nr_tags);
}
struct bt_iter_data {
struct blk_mq_hw_ctx *hctx;
busy_iter_fn *fn;
......@@ -340,9 +360,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags)
bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
flags | BT_TAG_ITER_RESERVED);
bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
}
/**
......@@ -379,9 +399,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
{
int i;
unsigned int flags = tagset->flags;
int i, nr_tags;
nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
for (i = 0; i < tagset->nr_hw_queues; i++) {
for (i = 0; i < nr_tags; i++) {
if (tagset->tags && tagset->tags[i])
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED);
......@@ -459,8 +482,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
continue;
if (tags->nr_reserved_tags)
bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
}
blk_queue_exit(q);
}
......@@ -492,56 +515,10 @@ int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
return -ENOMEM;
}
static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node, int alloc_policy)
{
int ret;
ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
&tags->__breserved_tags,
tags->nr_tags, tags->nr_reserved_tags,
node, alloc_policy);
if (ret)
return ret;
tags->bitmap_tags = &tags->__bitmap_tags;
tags->breserved_tags = &tags->__breserved_tags;
return 0;
}
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
{
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
int i, ret;
ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
set->queue_depth, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
return ret;
for (i = 0; i < set->nr_hw_queues; i++) {
struct blk_mq_tags *tags = set->tags[i];
tags->bitmap_tags = &set->__bitmap_tags;
tags->breserved_tags = &set->__breserved_tags;
}
return 0;
}
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
sbitmap_queue_free(&set->__bitmap_tags);
sbitmap_queue_free(&set->__breserved_tags);
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags,
int node, unsigned int flags)
int node, int alloc_policy)
{
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
......@@ -557,22 +534,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
if (blk_mq_is_sbitmap_shared(flags))
return tags;
if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
total_tags, reserved_tags, node,
alloc_policy) < 0) {
kfree(tags);
return NULL;
}
return tags;
}
void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
if (!blk_mq_is_sbitmap_shared(flags)) {
sbitmap_queue_free(tags->bitmap_tags);
sbitmap_queue_free(tags->breserved_tags);
}
sbitmap_queue_free(&tags->bitmap_tags);
sbitmap_queue_free(&tags->breserved_tags);
kfree(tags);
}
......@@ -592,7 +566,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set;
struct blk_mq_tags *new;
bool ret;
if (!can_grow)
return -EINVAL;
......@@ -604,34 +577,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > MAX_SCHED_RQ)
return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
tags->nr_reserved_tags, set->flags);
/*
* Only the sbitmap needs resizing since we allocated the max
* initially.
*/
if (blk_mq_is_shared_tags(set->flags))
return 0;
new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
if (ret) {
blk_mq_free_rq_map(new, set->flags);
return -ENOMEM;
}
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
blk_mq_free_rq_map(*tagsptr, set->flags);
blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
*tagsptr = new;
} else {
/*
* Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing.
*/
sbitmap_queue_resize(tags->bitmap_tags,
sbitmap_queue_resize(&tags->bitmap_tags,
tdepth - tags->nr_reserved_tags);
}
return 0;
}
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
{
struct blk_mq_tags *tags = set->shared_tags;
sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
}
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
{
sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
q->nr_requests - q->tag_set->reserved_tags);
}
/**
......
......@@ -2,52 +2,30 @@
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H
/*
* Tag address space map.
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
atomic_t active_queues;
struct sbitmap_queue *bitmap_tags;
struct sbitmap_queue *breserved_tags;
struct sbitmap_queue __bitmap_tags;
struct sbitmap_queue __breserved_tags;
struct request **rqs;
struct request **static_rqs;
struct list_head page_list;
/*
* used to clear request reference in rqs[] before freeing one
* request pool
*/
spinlock_t lock;
};
struct blk_mq_alloc_data;
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags,
int node, unsigned int flags);
extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
int node, int alloc_policy);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
struct sbitmap_queue *breserved_tags,
unsigned int queue_depth,
unsigned int reserved,
int node, int alloc_policy);
extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
unsigned int *offset);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag);
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags,
unsigned int depth, bool can_grow);
extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
unsigned int size);
extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
......
This diff is collapsed.
......@@ -25,18 +25,14 @@ struct blk_mq_ctx {
unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
struct request_queue *queue;
struct blk_mq_ctxs *ctxs;
struct kobject kobj;
} ____cacheline_aligned_in_smp;
void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
......@@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq);
*/
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
unsigned int reserved_tags,
unsigned int flags);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth);
void blk_mq_free_rq_map(struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int depth);
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags,
unsigned int hctx_idx);
/*
* Internal helpers for request insertion into sw queues
*/
......@@ -109,9 +102,9 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensure that if REQ_HIPRI, poll must be enabled.
* The caller ensure that if REQ_POLLED, poll must be enabled.
*/
if (flags & REQ_HIPRI)
if (flags & REQ_POLLED)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
......@@ -128,6 +121,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_release(struct request_queue *q);
......@@ -154,23 +149,27 @@ struct blk_mq_alloc_data {
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
unsigned int cmd_flags;
unsigned int rq_flags;
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
struct request **cached_rq;
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
};
static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
static inline bool blk_mq_is_shared_tags(unsigned int flags)
{
return flags & BLK_MQ_F_TAG_HCTX_SHARED;
}
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
if (data->q->elevator)
return data->hctx->sched_tags;
return data->hctx->tags;
if (!(data->rq_flags & RQF_ELV))
return data->hctx->tags;
return data->hctx->sched_tags;
}
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
......@@ -220,24 +219,24 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
if (blk_mq_is_shared_tags(hctx->flags))
atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
else
atomic_inc(&hctx->nr_active);
}
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
if (blk_mq_is_shared_tags(hctx->flags))
atomic_dec(&hctx->queue->nr_active_requests_shared_tags);
else
atomic_dec(&hctx->nr_active);
}
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
if (blk_mq_is_shared_tags(hctx->flags))
return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
return atomic_read(&hctx->nr_active);
}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
......@@ -260,7 +259,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
}
bool blk_mq_get_driver_tag(struct request *rq);
bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
static inline bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->tag != BLK_MQ_NO_TAG &&
!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
hctx->tags->rqs[rq->tag] = rq;
return true;
}
return __blk_mq_get_driver_tag(hctx, rq);
}
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{
......@@ -331,19 +343,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
if (bt->sb.depth == 1)
return true;
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return true;
users = atomic_read(&set->active_queues_shared_sbitmap);
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
users = atomic_read(&hctx->tags->active_queues);
}
users = atomic_read(&hctx->tags->active_queues);
if (!users)
return true;
......
......@@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
* BIO_TRACKED lets controllers know that a bio went through the
* normal rq_qos path.
*/
bio_set_flag(bio, BIO_TRACKED);
if (q->rq_qos)
if (q->rq_qos) {
bio_set_flag(bio, BIO_TRACKED);
__rq_qos_throttle(q->rq_qos, bio);
}
}
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
......
......@@ -17,6 +17,7 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-wbt.h"
#include "blk-throttle.h"
struct queue_sysfs_entry {
struct attribute attr;
......@@ -432,26 +433,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
static ssize_t queue_poll_store(struct request_queue *q, const char *page,
size_t count)
{
unsigned long poll_on;
ssize_t ret;
if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
!q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return -EINVAL;
ret = queue_var_store(&poll_on, page, count);
if (ret < 0)
return ret;
if (poll_on) {
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
} else {
blk_mq_freeze_queue(q);
blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
blk_mq_unfreeze_queue(q);
}
return ret;
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
pr_info_ratelimited("please use driver specific parameters instead.\n");
return count;
}
static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
......@@ -887,16 +873,15 @@ int blk_register_queue(struct gendisk *disk)
}
mutex_lock(&q->sysfs_lock);
ret = disk_register_independent_access_ranges(disk, NULL);
if (ret)
goto put_dev;
if (q->elevator) {
ret = elv_register_queue(q, false);
if (ret) {
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
if (ret)
goto put_dev;
}
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
......@@ -927,6 +912,16 @@ int blk_register_queue(struct gendisk *disk)
percpu_ref_switch_to_percpu(&q->q_usage_counter);
}
return ret;
put_dev:
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
......@@ -972,6 +967,7 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_lock(&q->sysfs_lock);
if (q->elevator)
elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
......
......@@ -13,6 +13,7 @@
#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-throttle.h"
/* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8
......@@ -37,60 +38,9 @@
*/
#define LATENCY_FILTERED_HD (1000L) /* 1ms */
static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
/*
* To implement hierarchical throttling, throtl_grps form a tree and bios
* are dispatched upwards level by level until they reach the top and get
* issued. When dispatching bios from the children and local group at each
* level, if the bios are dispatched into a single bio_list, there's a risk
* of a local or child group which can queue many bios at once filling up
* the list starving others.
*
* To avoid such starvation, dispatched bios are queued separately
* according to where they came from. When they are again dispatched to
* the parent, they're popped in round-robin order so that no single source
* hogs the dispatch window.
*
* throtl_qnode is used to keep the queued bios separated by their sources.
* Bios are queued to throtl_qnode which in turn is queued to
* throtl_service_queue and then dispatched in round-robin order.
*
* It's also used to track the reference counts on blkg's. A qnode always
* belongs to a throtl_grp and gets queued on itself or the parent, so
* incrementing the reference of the associated throtl_grp when a qnode is
* queued and decrementing when dequeued is enough to keep the whole blkg
* tree pinned while bios are in flight.
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
struct bio_list bios; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
struct throtl_service_queue {
struct throtl_service_queue *parent_sq; /* the parent service_queue */
/*
* Bios queued directly to this service_queue or dispatched from
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
struct rb_root_cached pending_tree; /* RB tree of active tgs */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
......@@ -98,93 +48,6 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
enum {
LIMIT_LOW,
LIMIT_MAX,
LIMIT_CNT,
};
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
/* active throtl group service_queue member */
struct rb_node rb_node;
/* throtl_data this group belongs to */
struct throtl_data *td;
/* this group's service queue */
struct throtl_service_queue service_queue;
/*
* qnode_on_self is used when bios are directly queued to this
* throtl_grp so that local bios compete fairly with bios
* dispatched from children. qnode_on_parent is used when bios are
* dispatched from this throtl_grp into its parent and will compete
* with the sibling qnode_on_parents and the parent's
* qnode_on_self.
*/
struct throtl_qnode qnode_on_self[2];
struct throtl_qnode qnode_on_parent[2];
/*
* Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as
* key to sort active groups in service tree.
*/
unsigned long disptime;
unsigned int flags;
/* are there any throtl rules between this group and td? */
bool has_rules[2];
/* internally used bytes per second rate limits */
uint64_t bps[2][LIMIT_CNT];
/* user configured bps limits */
uint64_t bps_conf[2][LIMIT_CNT];
/* internally used IOPS limits */
unsigned int iops[2][LIMIT_CNT];
/* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
unsigned long last_low_overflow_time[2];
uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2];
unsigned long last_check_time;
unsigned long latency_target; /* us */
unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
unsigned long last_finish_time; /* ns / 1024 */
unsigned long checked_last_finish_time; /* ns / 1024 */
unsigned long avg_idletime; /* ns / 1024 */
unsigned long idletime_threshold; /* us */
unsigned long idletime_threshold_conf; /* us */
unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
atomic_t io_split_cnt[2];
atomic_t last_io_split_cnt[2];
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
/* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9
......@@ -231,16 +94,6 @@ struct throtl_data
static void throtl_pending_timer_fn(struct timer_list *t);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}
static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
return pd_to_blkg(&tg->pd);
......@@ -1794,7 +1647,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work);
}
static struct blkcg_policy blkcg_policy_throtl = {
struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,
......@@ -2208,9 +2061,9 @@ void blk_throtl_charge_bio_split(struct bio *bio)
} while (parent);
}
bool blk_throtl_bio(struct bio *bio)
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg);
......@@ -2221,19 +2074,12 @@ bool blk_throtl_bio(struct bio *bio)
rcu_read_lock();
/* see throtl_charge_bio() */
if (bio_flagged(bio, BIO_THROTTLED))
goto out;
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
bio->bi_iter.bi_size);
blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
}
if (!tg->has_rules[rw])
goto out;
spin_lock_irq(&q->queue_lock);
throtl_update_latency_buckets(td);
......@@ -2317,7 +2163,6 @@ bool blk_throtl_bio(struct bio *bio)
out_unlock:
spin_unlock_irq(&q->queue_lock);
out:
bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
......
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H
#include "blk-cgroup-rwstat.h"
/*
* To implement hierarchical throttling, throtl_grps form a tree and bios
* are dispatched upwards level by level until they reach the top and get
* issued. When dispatching bios from the children and local group at each
* level, if the bios are dispatched into a single bio_list, there's a risk
* of a local or child group which can queue many bios at once filling up
* the list starving others.
*
* To avoid such starvation, dispatched bios are queued separately
* according to where they came from. When they are again dispatched to
* the parent, they're popped in round-robin order so that no single source
* hogs the dispatch window.
*
* throtl_qnode is used to keep the queued bios separated by their sources.
* Bios are queued to throtl_qnode which in turn is queued to
* throtl_service_queue and then dispatched in round-robin order.
*
* It's also used to track the reference counts on blkg's. A qnode always
* belongs to a throtl_grp and gets queued on itself or the parent, so
* incrementing the reference of the associated throtl_grp when a qnode is
* queued and decrementing when dequeued is enough to keep the whole blkg
* tree pinned while bios are in flight.
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
struct bio_list bios; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
struct throtl_service_queue {
struct throtl_service_queue *parent_sq; /* the parent service_queue */
/*
* Bios queued directly to this service_queue or dispatched from
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
struct rb_root_cached pending_tree; /* RB tree of active tgs */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
enum {
LIMIT_LOW,
LIMIT_MAX,
LIMIT_CNT,
};
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
/* active throtl group service_queue member */
struct rb_node rb_node;
/* throtl_data this group belongs to */
struct throtl_data *td;
/* this group's service queue */
struct throtl_service_queue service_queue;
/*
* qnode_on_self is used when bios are directly queued to this
* throtl_grp so that local bios compete fairly with bios
* dispatched from children. qnode_on_parent is used when bios are
* dispatched from this throtl_grp into its parent and will compete
* with the sibling qnode_on_parents and the parent's
* qnode_on_self.
*/
struct throtl_qnode qnode_on_self[2];
struct throtl_qnode qnode_on_parent[2];
/*
* Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as
* key to sort active groups in service tree.
*/
unsigned long disptime;
unsigned int flags;
/* are there any throtl rules between this group and td? */
bool has_rules[2];
/* internally used bytes per second rate limits */
uint64_t bps[2][LIMIT_CNT];
/* user configured bps limits */
uint64_t bps_conf[2][LIMIT_CNT];
/* internally used IOPS limits */
unsigned int iops[2][LIMIT_CNT];
/* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
unsigned long last_low_overflow_time[2];
uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2];
unsigned long last_check_time;
unsigned long latency_target; /* us */
unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
unsigned long last_finish_time; /* ns / 1024 */
unsigned long checked_last_finish_time; /* ns / 1024 */
unsigned long avg_idletime; /* ns / 1024 */
unsigned long idletime_threshold; /* us */
unsigned long idletime_threshold_conf; /* us */
unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
atomic_t io_split_cnt[2];
atomic_t last_io_split_cnt[2];
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
extern struct blkcg_policy blkcg_policy_throtl;
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}
static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}
/*
* Internal throttling interface
*/
#ifndef CONFIG_BLK_DEV_THROTTLING
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#else /* CONFIG_BLK_DEV_THROTTLING */
int blk_throtl_init(struct request_queue *q);
void blk_throtl_exit(struct request_queue *q);
void blk_throtl_register_queue(struct request_queue *q);
void blk_throtl_charge_bio_split(struct bio *bio);
bool __blk_throtl_bio(struct bio *bio);
static inline bool blk_throtl_bio(struct bio *bio)
{
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
if (bio_flagged(bio, BIO_THROTTLED))
return false;
if (!tg->has_rules[bio_data_dir(bio)])
return false;
return __blk_throtl_bio(bio);
}
#endif /* CONFIG_BLK_DEV_THROTTLING */
#endif
......@@ -357,6 +357,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
unsigned int inflight = wbt_inflight(rwb);
int status;
if (!rwb->rqos.q->disk)
return;
status = latency_exceeded(rwb, cb->stat);
trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step,
......
This diff is collapsed.
......@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment