Commit 0a4b6e2f authored by Linus Torvalds

Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:
 "This is the main pull request for block IO related changes for the
  4.16 kernel. Nothing major in this pull request, but a good amount of
  improvements and fixes all over the map. This contains:

   - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
     Paolo.

   - Support for SMR zones for deadline and mq-deadline from Damien and
     Christoph.

   - Set of fixes for bcache by way of Michael Lyle, including fixes
     from himself, Kent, Rui, Tang, and Coly.

   - Series from Matias for lightnvm with fixes from Hans Holmberg,
     Javier, and Matias. Mostly centered around pblk, and the removal
     of rrpc (1.2) in preparation for supporting 2.0.

   - A couple of NVMe pull requests from Christoph. Nothing major in
     here, just fixes and cleanups, and support for command tracing from
     Johannes.

   - Support in blk-throttle for tracking reads and writes separately.
     From Joseph Qi. A few cleanups/fixes also for blk-throttle from
     Weiping.

   - Series from Mike Snitzer that enables dm to register its queue more
     logically, something that's always been problematic on dm since
     it's a stacked device.

   - Series from Ming cleaning up some of the bio accessor use, in
     preparation for supporting multipage bvecs.

   - Various fixes from Ming closing up holes around queue mapping and
     quiescing.

   - BSD partition fix from Richard Narron, fixing a problem where we
     can't mount newer (10/11) FreeBSD partitions.

   - Series from Tejun reworking blk-mq timeout handling. The previous
     scheme relied on atomic bits, but it had races where we would think
     a request had timed out if it got reused at the wrong time.

   - null_blk now supports faking timeouts, to enable us to better
     exercise and test that functionality separately. From me.

   - Kill the separate atomic poll bit in the request struct. After
     this, we don't use the atomic bits on blk-mq anymore at all. From
     me.

   - sgl_alloc/free helpers from Bart.

   - Heavily contended tag case scalability improvement from me.

   - Various little fixes and cleanups from Arnd, Bart, Corentin,
     Douglas, Eryu, Goldwyn, and myself"

* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
  block: remove smart1,2.h
  nvme: add tracepoint for nvme_complete_rq
  nvme: add tracepoint for nvme_setup_cmd
  nvme-pci: introduce RECONNECTING state to mark initializing procedure
  nvme-rdma: remove redundant boolean for inline_data
  nvme: don't free uuid pointer before printing it
  nvme-pci: Suspend queues after deleting them
  bsg: use pr_debug instead of hand crafted macros
  blk-mq-debugfs: don't allow write on attributes with seq_operations set
  nvme-pci: Fix queue double allocations
  block: Set BIO_TRACE_COMPLETION on new bio during split
  blk-throttle: use queue_is_rq_based
  block: Remove kblockd_schedule_delayed_work{,_on}()
  blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
  blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
  lib/scatterlist: Fix chaining support in sgl_alloc_order()
  blk-throttle: track read and write request individually
  block: add bdev_read_only() checks to common helpers
  block: fail op_is_write() requests to read-only partitions
  blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
  ...
parents 9697e9da 796baeee
@@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
unsigned long flags;
int i;
spin_lock_irqsave(&bfqd->lock, flags);
if (!entity) /* root group */
return;
goto put_async_queues;
spin_lock_irqsave(&bfqd->lock, flags);
/*
* Empty all service_trees belonging to this group before
* deactivating the group itself.
@@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
}
__bfq_deactivate_entity(entity, false);
put_async_queues:
bfq_put_async_queues(bfqd, bfqg);
spin_unlock_irqrestore(&bfqd->lock, flags);
@@ -337,6 +337,11 @@ struct bfq_queue {
* last transition from idle to backlogged.
*/
unsigned long service_from_backlogged;
/*
* Cumulative service received from the @bfq_queue since its
* last transition to weight-raised state.
*/
unsigned long service_from_wr;
/*
* Value of wr start time when switching to soft rt
@@ -344,6 +349,8 @@ struct bfq_queue {
unsigned long wr_start_at_switch_to_srt;
unsigned long split_time; /* time of last split */
unsigned long first_IO_time; /* time of first I/O for this queue */
};
/**
@@ -627,6 +634,18 @@ struct bfq_data {
struct bfq_io_cq *bio_bic;
/* bfqq associated with the task issuing current bio for merging */
struct bfq_queue *bio_bfqq;
/*
* Cached sbitmap shift, used to compute depth limits in
* bfq_update_depths.
*/
unsigned int sb_shift;
/*
* Depth limits used in bfq_limit_depth (see comments on the
* function)
*/
unsigned int word_depths[2][2];
};
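/*
 * Illustration only (hypothetical helper, not the bfq code): one way a
 * shallower allocation depth can be derived from a cached sbitmap shift,
 * which is the kind of limit bfq_update_depths stores in word_depths[].
 */
static unsigned int example_shallow_depth(unsigned int sb_shift)
{
	unsigned int depth = 1U << sb_shift;	/* full depth of one sbitmap word */

	return max(depth >> 1, 1U);		/* halve it, but keep at least one tag */
}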
enum bfqq_state_flags {
@@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
struct bfq_entity *entity = &bfqq->entity;
struct bfq_service_tree *st;
if (!bfqq->service_from_backlogged)
bfqq->first_IO_time = jiffies;
if (bfqq->wr_coeff > 1)
bfqq->service_from_wr += served;
bfqq->service_from_backlogged += served;
for_each_entity(entity) {
st = bfq_entity_service_tree(entity);
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
/**
* __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
* @error: Pointer to errno
*
* Description: Completion for integrity I/O
*
@@ -970,34 +970,6 @@ void bio_advance(struct bio *bio, unsigned bytes)
}
EXPORT_SYMBOL(bio_advance);
/**
* bio_alloc_pages - allocates a single page for each bvec in a bio
* @bio: bio to allocate pages for
* @gfp_mask: flags for allocation
*
* Allocates pages up to @bio->bi_vcnt.
*
* Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
* freed.
*/
int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
{
int i;
struct bio_vec *bv;
bio_for_each_segment_all(bv, bio, i) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
__free_page(bv->bv_page);
return -ENOMEM;
}
}
return 0;
}
EXPORT_SYMBOL(bio_alloc_pages);
/**
* bio_copy_data - copy contents of data buffers from one chain of bios to
* another
@@ -1838,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
bio_advance(bio, split->bi_iter.bi_size);
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
bio_set_flag(bio, BIO_TRACE_COMPLETION);
bio_set_flag(split, BIO_TRACE_COMPLETION);
return split;
}
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
seqcount_init(&rq->gstate_seq);
u64_stats_init(&rq->aborted_gstate_sync);
}
EXPORT_SYMBOL(blk_rq_init);
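/*
 * Sketch of the read side that pairs with the initialization above
 * (condensed from this series' timeout rework; assumes the aborted_gstate
 * field it introduces): the u64_stats seqlock yields a consistent 64-bit
 * snapshot even on 32-bit machines.
 */
static u64 example_read_aborted_gstate(struct request *rq)
{
	unsigned int start;
	u64 aborted_gstate;

	do {
		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
		aborted_gstate = rq->aborted_gstate;
	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));

	return aborted_gstate;
}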
@@ -699,6 +701,15 @@ void blk_cleanup_queue(struct request_queue *q)
queue_flag_set(QUEUE_FLAG_DEAD, q);
spin_unlock_irq(lock);
/*
* make sure all in-progress dispatch are completed because
* blk_freeze_queue() can only complete all requests, and
* dispatch may still be in-progress since we dispatch requests
* from more than one context
*/
if (q->mq_ops)
blk_mq_quiesce_queue(q);
/* for synchronous bio-based driver finish in-flight integrity i/o */
blk_flush_integrity();
@@ -1646,6 +1657,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
lockdep_assert_held(q->queue_lock);
blk_req_zone_write_unlock(req);
blk_pm_put_request(req);
elv_completed_request(q, req);
@@ -2055,6 +2067,21 @@ static inline bool should_fail_request(struct hd_struct *part,
#endif /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
{
if (part->policy && op_is_write(bio_op(bio))) {
char b[BDEVNAME_SIZE];
printk(KERN_ERR
"generic_make_request: Trying to write "
"to read-only block-device %s (partno %d)\n",
bio_devname(bio, b), part->partno);
return true;
}
return false;
}
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
@@ -2063,27 +2090,28 @@ static inline int blk_partition_remap(struct bio *bio)
struct hd_struct *p;
int ret = 0;
rcu_read_lock();
p = __disk_get_part(bio->bi_disk, bio->bi_partno);
if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
bio_check_ro(bio, p))) {
ret = -EIO;
goto out;
}
/*
* Zone reset does not include bi_size so bio_sectors() is always 0.
* Include a test for the reset op code and perform the remap if needed.
*/
if (!bio->bi_partno ||
(!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
return 0;
if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
goto out;
rcu_read_lock();
p = __disk_get_part(bio->bi_disk, bio->bi_partno);
if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
bio->bi_iter.bi_sector += p->start_sect;
bio->bi_partno = 0;
trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
bio->bi_iter.bi_sector - p->start_sect);
} else {
printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
ret = -EIO;
}
rcu_read_unlock();
bio->bi_iter.bi_sector += p->start_sect;
bio->bi_partno = 0;
trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
bio->bi_iter.bi_sector - p->start_sect);
out:
rcu_read_unlock();
return ret;
}
@@ -2142,15 +2170,19 @@ generic_make_request_checks(struct bio *bio)
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue is not a request based queue.
*/
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
goto not_supported;
if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
goto end_io;
if (blk_partition_remap(bio))
goto end_io;
if (!bio->bi_partno) {
if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
goto end_io;
} else {
if (blk_partition_remap(bio))
goto end_io;
}
if (bio_check_eod(bio, nr_sectors))
goto end_io;
@@ -2493,8 +2525,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
blk_mq_request_bypass_insert(rq, true);
return BLK_STS_OK;
return blk_mq_request_issue_directly(rq);
}
spin_lock_irqsave(q->queue_lock, flags);
@@ -2846,7 +2877,7 @@ void blk_start_request(struct request *req)
wbt_issue(req->q->rq_wb, &req->issue_stat);
}
BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
BUG_ON(blk_rq_is_complete(req));
blk_add_timer(req);
}
EXPORT_SYMBOL(blk_start_request);
@@ -3415,20 +3446,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
int kblockd_schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work(kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_schedule_delayed_work);
int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
/**
* blk_start_plug - initialize blk_plug and track it inside the task_struct
* @plug: The &struct blk_plug that needs to be initialized
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
* be reused after dying flag is set
*/
if (q->mq_ops) {
blk_mq_sched_insert_request(rq, at_head, true, false, false);
blk_mq_sched_insert_request(rq, at_head, true, false);
return;
}
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secure_erase(q))
return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
while (nr_sects != 0) {
bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
gfp_mask);
@@ -119,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
int ret;
int ret = -EINVAL;
if (!iter_is_iovec(iter))
goto fail;
@@ -148,7 +148,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
__blk_rq_unmap_user(bio);
fail:
rq->bio = NULL;
return -EINVAL;
return ret;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
nsegs++;
sectors = max_sectors;
}
if (sectors)
goto split;
/* Make this single bvec as the 1st segment */
goto split;
}
if (bvprvp && blk_queue_cluster(q)) {
@@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bvprvp = &bvprv;
sectors += bv.bv_len >> 9;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
continue;
}
new_segment:
if (nsegs == queue_max_segments(q))
goto split;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
nsegs++;
bvprv = bv;
bvprvp = &bvprv;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
}
do_split = false;
@@ -174,6 +171,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bio = new;
}
if (nsegs == 1 && seg_size > front_seg_size)
front_seg_size = seg_size;
bio->bi_seg_front_size = front_seg_size;
if (seg_size > bio->bi_seg_back_size)
bio->bi_seg_back_size = seg_size;
@@ -289,17 +289,12 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(MQ_TIMEOUT_EXPIRED),
RQF_NAME(MQ_POLL_SLEPT),
};
#undef RQF_NAME
#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
static const char *const rqaf_name[] = {
RQAF_NAME(COMPLETE),
RQAF_NAME(STARTED),
RQAF_NAME(POLL_SLEPT),
};
#undef RQAF_NAME
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
seq_puts(m, ", .rq_flags=");
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name));
seq_puts(m, ", .atomic_flags=");
blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
rq->internal_tag);
if (mq_ops->show_rq)
@@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
const struct show_busy_params *params = data;
if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
blk_mq_rq_state(rq) != MQ_RQ_IDLE)
__blk_mq_debugfs_rq_show(params->m,
list_entry_rq(&rq->queuelist));
}
@@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
const struct blk_mq_debugfs_attr *attr = m->private;
void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
if (!attr->write)
/*
* Attributes that only implement .seq_ops are read-only and 'attr' is
* the same as 'data' in this case.
*/
if (attr == data || !attr->write)
return -EPERM;
return attr->write(data, buf, count, ppos);
@@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
WRITE_ONCE(hctx->dispatch_from, ctx);
}
/* return true if hw queue need to be run again */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
@@ -428,7 +427,7 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
}
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async, bool can_block)
bool run_queue, bool async)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async, bool can_block);
bool run_queue, bool async);
void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_ctx *ctx,
struct list_head *list, bool run_queue_async);
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret;
}
static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
q->mq_sysfs_init_done = false;
}
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
mutex_lock(&q->sysfs_lock);
__blk_mq_unregister_dev(dev, q);
mutex_unlock(&q->sysfs_lock);
}
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
@@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
drop_ctx = data->ctx == NULL;
do {
prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
break;
/*
* We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for
@@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != -1)
break;
prepare_to_wait_exclusive(&ws->wait, &wait,
TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
break;
if (data->ctx)
blk_mq_put_ctx(data->ctx);
@@ -27,6 +27,20 @@ struct blk_mq_ctx {
struct kobject kobj;
} ____cacheline_aligned_in_smp;
/*
* Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
* and the upper bits the generation number.
*/
enum mq_rq_state {
MQ_RQ_IDLE = 0,
MQ_RQ_IN_FLIGHT = 1,
MQ_RQ_COMPLETE = 2,
MQ_RQ_STATE_BITS = 2,
MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
};
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq);
/*
* CPU -> queue mappings
*/
@@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
void blk_mq_release(struct request_queue *q);
/**
* blk_mq_rq_state() - read the current MQ_RQ_* state of a request
* @rq: target request.
*/
static inline int blk_mq_rq_state(struct request *rq)
{
return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
}
/**
* blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
* @rq: target request.
* @state: new state to set.
*
* Set @rq's state to @state. The caller is responsible for ensuring that
* there are no other updaters. A request can transition into IN_FLIGHT
* only from IDLE and doing so increments the generation number.
*/
static inline void blk_mq_rq_update_state(struct request *rq,
enum mq_rq_state state)
{
u64 old_val = READ_ONCE(rq->gstate);
u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
if (state == MQ_RQ_IN_FLIGHT) {
WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
new_val += MQ_RQ_GEN_INC;
}
/* avoid exposing interim values */
WRITE_ONCE(rq->gstate, new_val);
}
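/*
 * Illustration (hypothetical helper, not part of this patch): because
 * entering IN_FLIGHT bumps the generation, a timeout scan that sampled
 * gstate when the timer was armed can tell a genuinely expired request
 * from one that completed and was recycled in the meantime.
 */
static inline bool example_rq_still_same_instance(struct request *rq,
						  u64 sampled_gstate)
{
	return READ_ONCE(rq->gstate) == sampled_gstate &&
	       blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT;
}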
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
@@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
*/
int blk_register_queue(struct gendisk *disk)
{
int ret;
@@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q);
if (ret) {
mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
goto unlock;
return ret;
}
}
ret = 0;
@@ -921,7 +926,15 @@ int blk_register_queue(struct gendisk *disk)
mutex_unlock(&q->sysfs_lock);
return ret;
}
EXPORT_SYMBOL_GPL(blk_register_queue);
/**
* blk_unregister_queue - counterpart of blk_register_queue()
* @disk: Disk of which the request queue should be unregistered from sysfs.
*
* Note: the caller is responsible for guaranteeing that this function is called
* after blk_register_queue() has finished.
*/
void blk_unregister_queue(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
@@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q))
return;
mutex_lock(&q->sysfs_lock);
queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
mutex_unlock(&q->sysfs_lock);
/* Return early if disk->queue was never registered. */
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
return;
wbt_exit(q);
/*
* Since sysfs_remove_dir() prevents adding new directory entries
* before removal of existing entries starts, protect against
* concurrent elv_iosched_store() calls.
*/
mutex_lock(&q->sysfs_lock);
spin_lock_irq(q->queue_lock);
queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
spin_unlock_irq(q->queue_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
*/
if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q);
if (q->request_fn || (q->mq_ops && q->elevator))
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
wbt_exit(q);
mutex_lock(&q->sysfs_lock);
if (q->request_fn || (q->mq_ops && q->elevator))
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_lock);
kobject_put(&disk_to_dev(disk)->kobj);
}
@@ -216,9 +216,9 @@ struct throtl_data
unsigned int scale;
struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
struct latency_bucket __percpu *latency_buckets;
struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
struct latency_bucket __percpu *latency_buckets[2];
unsigned long last_calculate_time;
unsigned long filtered_latency;
@@ -1510,11 +1510,21 @@ static struct cftype throtl_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_bytes,
},
{
.name = "throttle.io_service_bytes_recursive",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_bytes_recursive,
},
{
.name = "throttle.io_serviced",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios,
},
{
.name = "throttle.io_serviced_recursive",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios_recursive,
},
{ } /* terminate */
};
@@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td)
{
struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
int i, cpu;
unsigned long last_latency = 0;
unsigned long latency;
struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
int i, cpu, rw;
unsigned long last_latency[2] = { 0 };
unsigned long latency[2];
if (!blk_queue_nonrot(td->queue))
return;
@@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
td->last_calculate_time = jiffies;
memset(avg_latency, 0, sizeof(avg_latency));
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
struct latency_bucket *tmp = &td->tmp_buckets[i];
for_each_possible_cpu(cpu) {
struct latency_bucket *bucket;
/* this isn't race free, but ok in practice */
bucket = per_cpu_ptr(td->latency_buckets, cpu);
tmp->total_latency += bucket[i].total_latency;
tmp->samples += bucket[i].samples;
bucket[i].total_latency = 0;
bucket[i].samples = 0;
}
for (rw = READ; rw <= WRITE; rw++) {
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
for_each_possible_cpu(cpu) {
struct latency_bucket *bucket;
/* this isn't race free, but ok in practice */
bucket = per_cpu_ptr(td->latency_buckets[rw],
cpu);
tmp->total_latency += bucket[i].total_latency;
tmp->samples += bucket[i].samples;
bucket[i].total_latency = 0;
bucket[i].samples = 0;
}
if (tmp->samples >= 32) {
int samples = tmp->samples;
if (tmp->samples >= 32) {
int samples = tmp->samples;
latency = tmp->total_latency;
latency[rw] = tmp->total_latency;
tmp->total_latency = 0;
tmp->samples = 0;
latency /= samples;
if (latency == 0)
continue;
avg_latency[i].latency = latency;
tmp->total_latency = 0;
tmp->samples = 0;
latency[rw] /= samples;
if (latency[rw] == 0)
continue;
avg_latency[rw][i].latency = latency[rw];
}
}
}
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
if (!avg_latency[i].latency) {
if (td->avg_buckets[i].latency < last_latency)
td->avg_buckets[i].latency = last_latency;
continue;
}
for (rw = READ; rw <= WRITE; rw++) {
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
if (!avg_latency[rw][i].latency) {
if (td->avg_buckets[rw][i].latency < last_latency[rw])
td->avg_buckets[rw][i].latency =
last_latency[rw];
continue;
}
if (!td->avg_buckets[i].valid)
latency = avg_latency[i].latency;
else
latency = (td->avg_buckets[i].latency * 7 +
avg_latency[i].latency) >> 3;
if (!td->avg_buckets[rw][i].valid)
latency[rw] = avg_latency[rw][i].latency;
else
latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
avg_latency[rw][i].latency) >> 3;
td->avg_buckets[i].latency = max(latency, last_latency);
td->avg_buckets[i].valid = true;
last_latency = td->avg_buckets[i].latency;
td->avg_buckets[rw][i].latency = max(latency[rw],
last_latency[rw]);
td->avg_buckets[rw][i].valid = true;
last_latency[rw] = td->avg_buckets[rw][i].latency;
}
}
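/*
 * (Illustration: the update above is a 7/8 exponentially weighted moving
 * average, new_avg = (7 * old_avg + sample) >> 3, so a single sample can
 * move the estimate by at most an eighth of the difference.)
 */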
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
throtl_log(&td->service_queue,
"Latency bucket %d: latency=%ld, valid=%d", i,
td->avg_buckets[i].latency, td->avg_buckets[i].valid);
"Latency bucket %d: read latency=%ld, read valid=%d, "
"write latency=%ld, write valid=%d", i,
td->avg_buckets[READ][i].latency,
td->avg_buckets[READ][i].valid,
td->avg_buckets[WRITE][i].latency,
td->avg_buckets[WRITE][i].valid);
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2242,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
struct latency_bucket *latency;
int index;
if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
if (!td || td->limit_index != LIMIT_LOW ||
!(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
!blk_queue_nonrot(td->queue))
return;
index = request_bucket_index(size);
latency = get_cpu_ptr(td->latency_buckets);
latency = get_cpu_ptr(td->latency_buckets[op]);
latency[index].total_latency += time;
latency[index].samples++;
put_cpu_ptr(td->latency_buckets);
put_cpu_ptr(td->latency_buckets[op]);
}
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2270,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)
unsigned long finish_time;
unsigned long start_time;
unsigned long lat;
int rw = bio_data_dir(bio);
tg = bio->bi_cg_private;
if (!tg)
@@ -2298,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)
bucket = request_bucket_index(
blk_stat_size(&bio->bi_issue_stat));
threshold = tg->td->avg_buckets[bucket].latency +
threshold = tg->td->avg_buckets[rw][bucket].latency +
tg->latency_target;
if (lat > threshold)
tg->bad_bio_cnt++;
@@ -2391,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64));
if (!td->latency_buckets) {
if (!td->latency_buckets[READ]) {
kfree(td);
return -ENOMEM;
}
td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64));
if (!td->latency_buckets[WRITE]) {
free_percpu(td->latency_buckets[READ]);
kfree(td);
return -ENOMEM;
}
@@ -2412,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
if (ret) {
free_percpu(td->latency_buckets);
free_percpu(td->latency_buckets[READ]);
free_percpu(td->latency_buckets[WRITE]);
kfree(td);
}
return ret;
@@ -2423,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
free_percpu(q->td->latency_buckets);
free_percpu(q->td->latency_buckets[READ]);
free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
@@ -2441,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)
} else {
td->throtl_slice = DFL_THROTL_SLICE_HD;
td->filtered_latency = LATENCY_FILTERED_HD;
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
}
}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif
td->track_bio_latency = !q->mq_ops && !q->request_fn;
td->track_bio_latency = !queue_is_rq_based(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
}
@@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)
static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
unsigned int *next_set)
{
if (time_after_eq(jiffies, rq->deadline)) {
const unsigned long deadline = blk_rq_deadline(rq);
if (time_after_eq(jiffies, deadline)) {
list_del_init(&rq->timeout_list);
/*
@@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
*/
if (!blk_mark_rq_complete(rq))
blk_rq_timed_out(rq);
} else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
*next_timeout = rq->deadline;
} else if (!*next_set || time_after(*next_timeout, deadline)) {
*next_timeout = deadline;
*next_set = 1;
}
}
@@ -156,12 +158,17 @@
*/
void blk_abort_request(struct request *req)
{
if (blk_mark_rq_complete(req))
return;
if (req->q->mq_ops) {
blk_mq_rq_timed_out(req, false);
/*
* All we need to ensure is that timeout scan takes place
* immediately and that scan sees the new timeout value.
* No need for fancy synchronizations.
*/
blk_rq_set_deadline(req, jiffies);
mod_timer(&req->q->timeout, 0);
} else {
if (blk_mark_rq_complete(req))
return;
blk_delete_timer(req);
blk_rq_timed_out(req);
}
@@ -208,7 +215,8 @@ void blk_add_timer(struct request *req)
if (!req->timeout)
req->timeout = q->rq_timeout;
WRITE_ONCE(req->deadline, jiffies + req->timeout);
blk_rq_set_deadline(req, jiffies + req->timeout);
req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
/*
* Only the non-mq case needs to add the request to a protected list.
@@ -222,7 +230,7 @@
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
@@ -21,6 +21,48 @@ static inline sector_t blk_zone_start(struct request_queue *q,
return sector & ~zone_mask;
}
/*
* Return true if a request is a write request that needs zone write locking.
*/
bool blk_req_needs_zone_write_lock(struct request *rq)
{
if (!rq->q->seq_zones_wlock)
return false;
if (blk_rq_is_passthrough(rq))
return false;
switch (req_op(rq)) {
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_WRITE:
return blk_rq_zone_is_seq(rq);
default:
return false;
}
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
void __blk_req_zone_write_lock(struct request *rq)
{
if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
rq->q->seq_zones_wlock)))
return;
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
void __blk_req_zone_write_unlock(struct request *rq)
{
rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
if (rq->q->seq_zones_wlock)
WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
rq->q->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
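/*
 * Illustration only (hypothetical helper): both deadline schedulers in
 * this series use the API above the same way when picking a request to
 * dispatch, i.e. skip a write whose target zone is locked, and take the
 * zone write lock when actually dispatching.
 */
static struct request *example_try_dispatch_zoned(struct request *rq)
{
	if (!blk_req_can_dispatch_to_zone(rq))
		return NULL;		/* target zone is write-locked */

	blk_req_zone_write_lock(rq);	/* no-op if rq needs no zone lock */
	return rq;
}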
/*
* Check that a zone report belongs to the partition.
* If yes, fix its start sector and write pointer, copy it in the
@@ -119,34 +119,24 @@ void blk_account_io_start(struct request *req, bool new_io);
void blk_account_io_completion(struct request *req, unsigned int bytes);
void blk_account_io_done(struct request *req);
/*
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
/*
* Keep these two bits first - not because we depend on the
* value of them, but we do depend on them being in the same
* byte of storage to ensure ordering on writes. Keeping them
* first will achieve that nicely.
*/
REQ_ATOM_COMPLETE = 0,
REQ_ATOM_STARTED,
REQ_ATOM_POLL_SLEPT,
};
/*
* EH timer and IO completion will both attempt to 'grab' the request, make
* sure that only one of them succeeds
* sure that only one of them succeeds. Steal the bottom bit of the
* __deadline field for this.
*/
static inline int blk_mark_rq_complete(struct request *rq)
{
return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
return test_and_set_bit(0, &rq->__deadline);
}
static inline void blk_clear_rq_complete(struct request *rq)
{
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
clear_bit(0, &rq->__deadline);
}
static inline bool blk_rq_is_complete(struct request *rq)
{
return test_bit(0, &rq->__deadline);
}
/*
@@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
}
int elv_register_queue(struct request_queue *q);
void elv_unregister_queue(struct request_queue *q);
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -245,6 +238,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
q->last_merge = NULL;
}
/*
* Steal a bit from this field for legacy IO path atomic IO marking. Note that
* setting the deadline clears the bottom bit, potentially clearing the
* completed bit. The user has to be OK with this (current ones are fine).
*/
static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
{
rq->__deadline = time & ~0x1UL;
}
static inline unsigned long blk_rq_deadline(struct request *rq)
{
return rq->__deadline & ~0x1UL;
}
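/*
 * Illustration (hypothetical helper): the two uses of __deadline
 * interlock, since blk_mark_rq_complete() sets bit 0 and arming a fresh
 * deadline clears it again.
 */
static inline void example_rearm(struct request *rq)
{
	blk_rq_set_deadline(rq, jiffies + 30 * HZ);	/* clears bit 0 */
	WARN_ON_ONCE(blk_rq_is_complete(rq));		/* so this never fires */
}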
/*
* Internal io_context interface
*/
@@ -113,45 +113,50 @@ int init_emergency_isa_pool(void)
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
unsigned char *vfrom;
struct bio_vec tovec, *fromvec = from->bi_io_vec;
struct bio_vec tovec, fromvec;
struct bvec_iter iter;
/*
* The bio of @from is created by bounce, so we can iterate
* its bvec from start to end, but the @from->bi_iter can't be
* trusted because it might be changed by splitting.
*/
struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
bio_for_each_segment(tovec, to, iter) {
if (tovec.bv_page != fromvec->bv_page) {
fromvec = bio_iter_iovec(from, from_iter);
if (tovec.bv_page != fromvec.bv_page) {
/*
* fromvec->bv_offset and fromvec->bv_len might have
* been modified by the block layer, so use the original
* copy, bounce_copy_vec already uses tovec->bv_len
*/
vfrom = page_address(fromvec->bv_page) +
vfrom = page_address(fromvec.bv_page) +
tovec.bv_offset;
bounce_copy_vec(&tovec, vfrom);
flush_dcache_page(tovec.bv_page);
}
fromvec++;
bio_advance_iter(from, &from_iter, tovec.bv_len);
}
}
static void bounce_end_io(struct bio *bio, mempool_t *pool)
{
struct bio *bio_orig = bio->bi_private;
struct bio_vec *bvec, *org_vec;
struct bio_vec *bvec, orig_vec;
int i;
int start = bio_orig->bi_iter.bi_idx;
struct bvec_iter orig_iter = bio_orig->bi_iter;
/*
* free up bounce indirect pages used
*/
bio_for_each_segment_all(bvec, bio, i) {
org_vec = bio_orig->bi_io_vec + i + start;
if (bvec->bv_page == org_vec->bv_page)
continue;
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
mempool_free(bvec->bv_page, pool);
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
if (bvec->bv_page != orig_vec.bv_page) {
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
mempool_free(bvec->bv_page, pool);
}
bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
}
bio_orig->bi_status = bio->bi_status;
@@ -30,7 +30,7 @@
/**
* bsg_teardown_job - routine to teardown a bsg job
* @job: bsg_job that is to be torn down
* @kref: kref inside bsg_job that is to be torn down
*/
static void bsg_teardown_job(struct kref *kref)
{
@@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
* @name: device to give bsg device
* @job_fn: bsg job handler
* @dd_job_size: size of LLD data needed for each job
* @release: @dev release function
*/
struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
bsg_job_fn *job_fn, int dd_job_size,
@@ -32,6 +32,9 @@
#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver"
#define BSG_VERSION "0.4"
#define bsg_dbg(bd, fmt, ...) \
pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__)
struct bsg_device {
struct request_queue *queue;
spinlock_t lock;
@@ -55,14 +58,6 @@ enum {
#define BSG_DEFAULT_CMDS 64
#define BSG_MAX_DEVS 32768
#undef BSG_DEBUG
#ifdef BSG_DEBUG
#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args)
#else
#define dprintk(fmt, args...)
#endif
static DEFINE_MUTEX(bsg_mutex);
static DEFINE_IDR(bsg_minor_idr);
@@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
bc->bd = bd;
INIT_LIST_HEAD(&bc->list);
dprintk("%s: returning free cmd %p\n", bd->name, bc);
bsg_dbg(bd, "returning free cmd %p\n", bc);
return bc;
out:
spin_unlock_irq(&bd->lock);
@@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
if (!bcd->class_dev)
return ERR_PTR(-ENXIO);
dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp,
bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n",
(unsigned long long) hdr->dout_xferp,
hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
hdr->din_xfer_len);
@@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status)
struct bsg_device *bd = bc->bd;
unsigned long flags;
dprintk("%s: finished rq %p bc %p, bio %p\n",
bd->name, rq, bc, bc->bio);
bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
rq, bc, bc->bio);
bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
@@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
list_add_tail(&bc->list, &bd->busy_list);
spin_unlock_irq(&bd->lock);
dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc);
bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
rq->end_io_data = bc;
blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
@@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
}
} while (1);
dprintk("%s: returning done %p\n", bd->name, bc);
bsg_dbg(bd, "returning done %p\n", bc);
return bc;
}
@@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
struct scsi_request *req = scsi_req(rq);
int ret = 0;
dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);
/*
* fill in all the output members
*/
@@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
struct bsg_command *bc;
int ret, tret;
dprintk("%s: entered\n", bd->name);
bsg_dbg(bd, "entered\n");
/*
* wait for all commands to complete
@@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
int ret;
ssize_t bytes_read;
dprintk("%s: read %zd bytes\n", bd->name, count);
bsg_dbg(bd, "read %zd bytes\n", count);
bsg_set_block(bd, file);
@@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
ssize_t bytes_written;
int ret;
dprintk("%s: write %zd bytes\n", bd->name, count);
bsg_dbg(bd, "write %zd bytes\n", count);
if (unlikely(uaccess_kernel()))
return -EINVAL;
@@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
if (!bytes_written || err_block_err(ret))
bytes_written = ret;
dprintk("%s: returning %zd\n", bd->name, bytes_written);
bsg_dbg(bd, "returning %zd\n", bytes_written);
return bytes_written;
}
@@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd)
hlist_del(&bd->dev_list);
mutex_unlock(&bsg_mutex);
dprintk("%s: tearing down\n", bd->name);
bsg_dbg(bd, "tearing down\n");
/*
* close can always block
@@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
struct file *file)
{
struct bsg_device *bd;
#ifdef BSG_DEBUG
unsigned char buf[32];
#endif
if (!blk_queue_scsi_passthrough(rq)) {
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
@@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
dprintk("bound to <%s>, max queue %d\n",
bsg_dbg(bd, "bound to <%s>, max queue %d\n",
format_dev_t(buf, inode->i_rdev), bd->max_queue);
mutex_unlock(&bsg_mutex);
@@ -50,8 +50,6 @@ struct deadline_data {
int front_merges;
};
static void deadline_move_request(struct deadline_data *, struct request *);
static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
@@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq)
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
/*
* This may be a requeue of a write request that has locked its
* target zone. If it is the case, this releases the zone lock.
*/
blk_req_zone_write_unlock(rq);
deadline_add_rq_rb(dd, rq);
/*
@@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
struct request_queue *q = rq->q;
/*
* For a zoned block device, write requests must write lock their
* target zone.
*/
blk_req_zone_write_lock(rq);
deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq);
}
@@ -230,6 +240,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
return 0;
}
/*
* For the specified data direction, return the next request to dispatch using
* arrival ordered lists.
*/
static struct request *
deadline_fifo_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
if (blk_req_can_dispatch_to_zone(rq))
return rq;
}
return NULL;
}
/*
* For the specified data direction, return the next request to dispatch using
* sector position sorted lists.
*/
static struct request *
deadline_next_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
rq = dd->next_rq[data_dir];
if (!rq)
return NULL;
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
return rq;
rq = deadline_latter_request(rq);
}
return NULL;
}
/*
* deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc
@@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
struct request *rq;
struct request *rq, *next_rq;
int data_dir;
/*
* batches are currently reads XOR writes
*/
if (dd->next_rq[WRITE])
rq = dd->next_rq[WRITE];
else
rq = dd->next_rq[READ];
rq = deadline_next_request(dd, WRITE);
if (!rq)
rq = deadline_next_request(dd, READ);
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request and are still entitled to batch */
@@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
if (writes && (dd->starved++ >= dd->writes_starved))
if (deadline_fifo_request(dd, WRITE) &&
(dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
data_dir = READ;
@@ -291,21 +364,29 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
/*
* we are not running a batch, find best request for selected data_dir
*/
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
next_rq = deadline_next_request(dd, data_dir);
if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
rq = deadline_fifo_request(dd, data_dir);
} else {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
rq = dd->next_rq[data_dir];
rq = next_rq;
}
/*
* For a zoned block device, if we only have writes queued and none of
* them can be dispatched, rq will be NULL.
*/
if (!rq)
return 0;
dd->batching = 0;
dispatch_request:
@@ -318,6 +399,16 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
return 1;
}
/*
* For zoned block devices, write unlock the target zone of completed
* write requests.
*/
static void
deadline_completed_request(struct request_queue *q, struct request *rq)
{
blk_req_zone_write_unlock(rq);
}
static void deadline_exit_queue(struct elevator_queue *e)
{
struct deadline_data *dd = e->elevator_data;
@@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = {
.elevator_merged_fn = deadline_merged_request,
.elevator_merge_req_fn = deadline_merged_requests,
.elevator_dispatch_fn = deadline_dispatch_requests,
.elevator_completed_req_fn = deadline_completed_request,
.elevator_add_req_fn = deadline_add_request,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
@@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
struct elevator_queue *e = q->elevator;
int error;
lockdep_assert_held(&q->sysfs_lock);
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
}
return error;
}
EXPORT_SYMBOL(elv_register_queue);
void elv_unregister_queue(struct request_queue *q)
{
lockdep_assert_held(&q->sysfs_lock);
if (q) {
struct elevator_queue *e = q->elevator;
@@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
wbt_enable_default(q);
}
}
EXPORT_SYMBOL(elv_unregister_queue);
int elv_register(struct elevator_type *e)
{
@@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q,
{
int ret;
lockdep_assert_held(&q->sysfs_lock);
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
if (q->elevator->registered)
@@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q,
blk_add_trace_msg(q, "elv switch: none");
out:
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return ret;
}
@@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
bool old_registered = false;
int err;
lockdep_assert_held(&q->sysfs_lock);
if (q->mq_ops)
return elevator_switch_mq(q, new_e);
@@ -629,16 +629,18 @@ static void register_disk(struct device *parent, struct gendisk *disk)
}
/**
* device_add_disk - add partitioning information to kernel list
* __device_add_disk - add disk information to kernel list
* @parent: parent device for the disk
* @disk: per-device partitioning information
* @register_queue: register the queue if set to true
*
* This function registers the partitioning information in @disk
* with the kernel.
*
* FIXME: error handling
*/
void device_add_disk(struct device *parent, struct gendisk *disk)
static void __device_add_disk(struct device *parent, struct gendisk *disk,
bool register_queue)
{
dev_t devt;
int retval;
@@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
exact_match, exact_lock, disk);
}
register_disk(parent, disk);
blk_register_queue(disk);
if (register_queue)
blk_register_queue(disk);
/*
* Take an extra ref on queue which will be put on disk_release()
@@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
disk_add_events(disk);
blk_integrity_add(disk);
}
void device_add_disk(struct device *parent, struct gendisk *disk)
{
__device_add_disk(parent, disk, true);
}
EXPORT_SYMBOL(device_add_disk);
void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
{
__device_add_disk(parent, disk, false);
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
void del_gendisk(struct gendisk *disk)
{
struct disk_part_iter piter;
@@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)
* Unregister bdi before releasing device numbers (as they can
* get reused and we'd get clashes in sysfs).
*/
bdi_unregister(disk->queue->backing_dev_info);
if (!(disk->flags & GENHD_FL_HIDDEN))
bdi_unregister(disk->queue->backing_dev_info);
blk_unregister_queue(disk);
} else {
WARN_ON(1);
@@ -59,6 +59,7 @@ struct deadline_data {
int front_merges;
spinlock_t lock;
spinlock_t zone_lock;
struct list_head dispatch;
};
@@ -191,14 +192,84 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
return 0;
}
/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
*/
static struct request *
deadline_fifo_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
unsigned long flags;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
if (blk_req_can_dispatch_to_zone(rq))
goto out;
}
rq = NULL;
out:
spin_unlock_irqrestore(&dd->zone_lock, flags);
return rq;
}
/*
* For the specified data direction, return the next request to
* dispatch using sector position sorted lists.
*/
static struct request *
deadline_next_request(struct deadline_data *dd, int data_dir)
{
struct request *rq;
unsigned long flags;
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
rq = dd->next_rq[data_dir];
if (!rq)
return NULL;
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
break;
rq = deadline_latter_request(rq);
}
spin_unlock_irqrestore(&dd->zone_lock, flags);
return rq;
}
/*
* deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc
*/
static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
static struct request *__dd_dispatch_request(struct deadline_data *dd)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
struct request *rq;
struct request *rq, *next_rq;
bool reads, writes;
int data_dir;
@@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
/*
* batches are currently reads XOR writes
*/
if (dd->next_rq[WRITE])
rq = dd->next_rq[WRITE];
else
rq = dd->next_rq[READ];
rq = deadline_next_request(dd, WRITE);
if (!rq)
rq = deadline_next_request(dd, READ);
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
if (writes && (dd->starved++ >= dd->writes_starved))
if (deadline_fifo_request(dd, WRITE) &&
(dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
data_dir = READ;
@@ -260,21 +331,29 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
/*
* we are not running a batch, find best request for selected data_dir
*/
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
next_rq = deadline_next_request(dd, data_dir);
if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
rq = deadline_fifo_request(dd, data_dir);
} else {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
rq = dd->next_rq[data_dir];
rq = next_rq;
}
/*
* For a zoned block device, if we only have writes queued and none of
* them can be dispatched, rq will be NULL.
*/
if (!rq)
return NULL;
dd->batching = 0;
dispatch_request:
......@@ -284,17 +363,27 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
dd->batching++;
deadline_move_request(dd, rq);
done:
/*
* If the request needs its target zone locked, do it.
*/
blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
/*
* One confusing aspect here is that we get called for a specific
* hardware queue, but we may return a request that is for a
* different hardware queue. This is because mq-deadline has shared
* state for all hardware queues, in terms of sorting, FIFOs, etc.
*/
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
struct request *rq;
spin_lock(&dd->lock);
rq = __dd_dispatch_request(hctx);
rq = __dd_dispatch_request(dd);
spin_unlock(&dd->lock);
return rq;
@@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
INIT_LIST_HEAD(&dd->dispatch);
q->elevator = eq;
@@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
/*
* This may be a requeue of a write request that has locked its
* target zone. If it is the case, this releases the zone lock.
*/
blk_req_zone_write_unlock(rq);
if (blk_mq_sched_try_insert_merge(q, rq))
return;
......@@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
spin_unlock(&dd->lock);
}
/*
* For zoned block devices, write unlock the target zone of
* completed write requests. Do this while holding the zone lock
* spinlock so that the zone is never unlocked while deadline_fifo_request()
* or deadline_next_request() are executing.
*/
static void dd_completed_request(struct request *rq)
{
struct request_queue *q = rq->q;
if (blk_queue_is_zoned(q)) {
struct deadline_data *dd = q->elevator->elevator_data;
unsigned long flags;
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
spin_unlock_irqrestore(&dd->zone_lock, flags);
}
}
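The zone write lock itself is a per-zone bitmap hanging off the request queue. Simplified sketch of the mechanism (the real helpers also have a blk_req_needs_zone_write_lock() fast path and WARN_ON_ONCE checks; seq_zones_wlock and RQF_ZONE_WRITE_LOCKED are the names used by this series):

/* Lock the zone containing rq's sector. */
static void zone_write_lock_sketch(struct request *rq)
{
	if (!test_and_set_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock))
		rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}

/* Unlock on completion or requeue; a no-op if rq never took the lock. */
static void zone_write_unlock_sketch(struct request *rq)
{
	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) {
		rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
		clear_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
	}
}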
static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
......@@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = {
.ops.mq = {
.insert_requests = dd_insert_requests,
.dispatch_request = dd_dispatch_request,
.completed_request = dd_completed_request,
.next_request = elv_rb_latter_request,
.former_request = elv_rb_former_request,
.bio_merge = dd_bio_merge,
......
......@@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state,
continue;
bsd_start = le32_to_cpu(p->p_offset);
bsd_size = le32_to_cpu(p->p_size);
if (memcmp(flavour, "bsd\0", 4) == 0)
/* FreeBSD has relative offset if C partition offset is zero */
if (memcmp(flavour, "bsd\0", 4) == 0 &&
le32_to_cpu(l->d_partitions[2].p_offset) == 0)
bsd_start += offset;
if (offset == bsd_start && size == bsd_size)
/* full parent partition, we have it already */
......
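Worked example of the check above (numbers illustrative): take a FreeBSD slice starting at absolute sector offset = 2048. A new-style (FreeBSD 10/11) disklabel stores partition offsets relative to the slice, and its whole-slice 'c' partition (d_partitions[2]) records p_offset == 0, so a sub-partition with p_offset == 1024 really starts at bsd_start = 1024 + 2048 = 3072 and the adjustment applies. An old-style label stores absolute offsets, its 'c' partition records p_offset == 2048, and bsd_start is left untouched.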
......@@ -384,9 +384,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
/**
* sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
* @file: file this ioctl operates on (optional)
* @q: request queue to send scsi commands down
* @disk: gendisk to operate on (optional)
* @mode: mode used to open the file through which the ioctl has been
* submitted
* @sic: userspace structure describing the command to perform
*
* Send down the scsi command described by @sic to the device below
......@@ -415,10 +416,10 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
* Positive numbers returned are the compacted SCSI error codes (4
* bytes in one int) where the lowest byte is the SCSI status.
*/
#define OMAX_SB_LEN 16 /* For backward compatibility */
int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
struct scsi_ioctl_command __user *sic)
{
enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */
struct request *rq;
struct scsi_request *req;
int err;
......@@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
if (bd && bd == bd->bd_contains)
return 0;
/* Actually none of these is particularly useful on a partition,
* but they are safe.
*/
switch (cmd) {
case SCSI_IOCTL_GET_IDLUN:
case SCSI_IOCTL_GET_BUS_NUMBER:
case SCSI_IOCTL_GET_PCI:
case SCSI_IOCTL_PROBE_HOST:
case SG_GET_VERSION_NUM:
case SG_SET_TIMEOUT:
case SG_GET_TIMEOUT:
case SG_GET_RESERVED_SIZE:
case SG_SET_RESERVED_SIZE:
case SG_EMULATED_HOST:
return 0;
case CDROM_GET_CAPABILITY:
/* Keep this until we remove the printk below. udev sends it
* and we do not want to spam dmesg about it. CD-ROMs do
* not have partitions, so we get here only for disks.
*/
return -ENOIOCTLCMD;
default:
break;
}
if (capable(CAP_SYS_RAWIO))
return 0;
/* In particular, rule out all resets and host-specific ioctls. */
printk_ratelimited(KERN_WARNING
"%s: sending ioctl %x to a partition!\n", current->comm, cmd);
return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(scsi_verify_blk_ioctl);
......
......@@ -106,6 +106,7 @@ config CRYPTO_KPP
config CRYPTO_ACOMP2
tristate
select CRYPTO_ALGAPI2
select SGL_ALLOC
config CRYPTO_ACOMP
tristate
......
......@@ -140,53 +140,6 @@ static int crypto_scomp_init_tfm(struct crypto_tfm *tfm)
return ret;
}
static void crypto_scomp_sg_free(struct scatterlist *sgl)
{
int i, n;
struct page *page;
if (!sgl)
return;
n = sg_nents(sgl);
for_each_sg(sgl, sgl, n, i) {
page = sg_page(sgl);
if (page)
__free_page(page);
}
kfree(sgl);
}
static struct scatterlist *crypto_scomp_sg_alloc(size_t size, gfp_t gfp)
{
struct scatterlist *sgl;
struct page *page;
int i, n;
n = ((size - 1) >> PAGE_SHIFT) + 1;
sgl = kmalloc_array(n, sizeof(struct scatterlist), gfp);
if (!sgl)
return NULL;
sg_init_table(sgl, n);
for (i = 0; i < n; i++) {
page = alloc_page(gfp);
if (!page)
goto err;
sg_set_page(sgl + i, page, PAGE_SIZE, 0);
}
return sgl;
err:
sg_mark_end(sgl + i);
crypto_scomp_sg_free(sgl);
return NULL;
}
static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
{
struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
......@@ -220,7 +173,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir)
scratch_dst, &req->dlen, *ctx);
if (!ret) {
if (!req->dst) {
req->dst = crypto_scomp_sg_alloc(req->dlen, GFP_ATOMIC);
req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL);
if (!req->dst)
goto out;
}
......@@ -274,7 +227,7 @@ int crypto_init_scomp_ops_async(struct crypto_tfm *tfm)
crt->compress = scomp_acomp_compress;
crt->decompress = scomp_acomp_decompress;
crt->dst_free = crypto_scomp_sg_free;
crt->dst_free = sgl_free;
crt->reqsize = sizeof(void *);
return 0;
......
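The open-coded allocator above is replaced by Bart's lib/scatterlist helpers. sgl_alloc(length, gfp, nent_p) allocates enough order-0 pages to cover length and links them into a freshly allocated table; sgl_free() releases both the pages and the table. A minimal usage sketch (make_dst_sgl is a hypothetical caller):

#include <linux/scatterlist.h>

static struct scatterlist *make_dst_sgl(size_t len)
{
	unsigned int nents;
	struct scatterlist *sgl;

	sgl = sgl_alloc(len, GFP_ATOMIC, &nents);	/* nent_p may be NULL */
	if (!sgl)
		return NULL;
	pr_debug("mapped %zu bytes over %u entries\n", len, nents);
	return sgl;
}

/* ... use the list, then: sgl_free(sgl); */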
......@@ -20,6 +20,10 @@ config BLK_DEV_NULL_BLK
tristate "Null test block driver"
select CONFIGFS_FS
config BLK_DEV_NULL_BLK_FAULT_INJECTION
bool "Support fault injection for Null test block driver"
depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
config BLK_DEV_FD
tristate "Normal floppy disk support"
depends on ARCH_MAY_HAVE_PC_FDC
......
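With this option enabled, null_blk can fake request timeouts to exercise the blk-mq timeout rework mentioned in the pull request. The knob in the 4.16 series is a module parameter in the standard fault-injection format <interval>,<probability>,<space>,<times>; treat the exact parameter name as an assumption if you are on a different tree:

# each request has about a 50% chance of timing out, with no limit
modprobe null_blk timeout="1,50,0,-1"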
......@@ -112,8 +112,7 @@ enum frame_flags {
struct frame {
struct list_head head;
u32 tag;
struct timeval sent; /* high-res time packet was sent */
u32 sent_jiffs; /* low-res jiffies-based sent time */
ktime_t sent; /* high-res time packet was sent */
ulong waited;
ulong waited_total;
struct aoetgt *t; /* parent target I belong to */
......
......@@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d)
skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) {
do_gettimeofday(&f->sent);
f->sent_jiffs = (u32) jiffies;
f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
......@@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f)
skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL)
return;
do_gettimeofday(&f->sent);
f->sent_jiffs = (u32) jiffies;
f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
......@@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f)
static int
tsince_hr(struct frame *f)
{
struct timeval now;
int n;
u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
do_gettimeofday(&now);
n = now.tv_usec - f->sent.tv_usec;
n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
/* delta is normally under 4.2 seconds, avoid 64-bit division */
if (likely(delta <= UINT_MAX))
return (u32)delta / NSEC_PER_USEC;
if (n < 0)
n = -n;
/* avoid overflow after 71 minutes */
if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
return INT_MAX;
/* For relatively long periods, use jiffies to avoid
* discrepancies caused by updates to the system time.
*
* On system with HZ of 1000, 32-bits is over 49 days
* worth of jiffies, or over 71 minutes worth of usecs.
*
* Jiffies overflow is handled by subtraction of unsigned ints:
* (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
* $3 = 4
* (gdb)
*/
if (n > USEC_PER_SEC / 4) {
n = ((u32) jiffies) - f->sent_jiffs;
n *= USEC_PER_SEC / HZ;
}
return n;
return div_u64(delta, NSEC_PER_USEC);
}
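The rewritten tsince_hr() keeps the common case division-free on 32-bit hosts: deltas that fit in 32 bits of nanoseconds use a cheap 32-bit divide, oversized deltas are clamped, and only the middle range pays for div_u64(). The pattern as a standalone sketch:

static int usecs_since(ktime_t sent)
{
	u64 delta = ktime_to_ns(ktime_sub(ktime_get(), sent));

	if (likely(delta <= UINT_MAX))		/* under ~4.3 seconds */
		return (u32)delta / NSEC_PER_USEC;
	if (delta > (u64)INT_MAX * NSEC_PER_USEC)
		return INT_MAX;			/* clamp very long waits */
	return div_u64(delta, NSEC_PER_USEC);
}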
static int
......@@ -589,7 +571,6 @@ reassign_frame(struct frame *f)
nf->waited = 0;
nf->waited_total = f->waited_total;
nf->sent = f->sent;
nf->sent_jiffs = f->sent_jiffs;
f->skb = skb;
return nf;
......@@ -633,8 +614,7 @@ probe(struct aoetgt *t)
skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) {
do_gettimeofday(&f->sent);
f->sent_jiffs = (u32) jiffies;
f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
......@@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d)
d->timer.function = rexmit_timer;
skb = skb_clone(skb, GFP_ATOMIC);
if (skb) {
do_gettimeofday(&f->sent);
f->sent_jiffs = (u32) jiffies;
}
if (skb)
f->sent = ktime_get();
return skb;
}
......
......@@ -953,7 +953,7 @@ static void drbd_bm_endio(struct bio *bio)
struct drbd_bm_aio_ctx *ctx = bio->bi_private;
struct drbd_device *device = ctx->device;
struct drbd_bitmap *b = device->bitmap;
unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
!bm_test_page_unchanged(b->bm_pages[idx]))
......
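bio_first_page_all() comes from Ming's accessor cleanup in preparation for multipage bvecs; completion handlers stop dereferencing bi_io_vec[0] directly so the bvec layout can change later. In 4.16 the helpers are roughly:

#define bio_first_bvec_all(bio)	((bio)->bi_io_vec)
#define bio_first_page_all(bio)	bio_first_bvec_all(bio)->bv_page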
......@@ -2579,14 +2579,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
bdev = bdget(dev);
if (!bdev)
return -ENOMEM;
ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
if (ret)
return ret;
if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
bdput(bdev);
blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
return -EINVAL;
}
ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
if (ret)
return ret;
/* This is safe, since we have a reference from open(). */
__module_get(THIS_MODULE);
......@@ -2745,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
pd->pkt_dev = MKDEV(pktdev_major, idx);
ret = pkt_new_dev(pd, dev);
if (ret)
goto out_new_dev;
goto out_mem2;
/* inherit events of the host device */
disk->events = pd->bdev->bd_disk->events;
......@@ -2763,8 +2763,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
mutex_unlock(&ctl_mutex);
return 0;
out_new_dev:
blk_cleanup_queue(disk->queue);
out_mem2:
put_disk(disk);
out_mem:
......
......@@ -430,7 +430,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry)
static void zram_page_end_io(struct bio *bio)
{
struct page *page = bio->bi_io_vec[0].bv_page;
struct page *page = bio_first_page_all(bio);
page_endio(page, op_is_write(bio_op(bio)),
blk_status_to_errno(bio->bi_status));
......
......@@ -27,13 +27,6 @@ config NVM_DEBUG
It is required to create/remove targets without IOCTLs.
config NVM_RRPC
tristate "Round-robin Hybrid Open-Channel SSD target"
---help---
Allows an open-channel SSD to be exposed as a block device to the
host. The target is implemented using a linear mapping table and
cost-based garbage collection. It is optimized for 4K IO sizes.
config NVM_PBLK
tristate "Physical Block Device Open-Channel SSD target"
---help---
......
......@@ -4,7 +4,6 @@
#
obj-$(CONFIG_NVM) := core.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
obj-$(CONFIG_NVM_PBLK) += pblk.o
pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
pblk-write.o pblk-cache.o pblk-read.o \
......
......@@ -19,12 +19,16 @@
int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
{
struct request_queue *q = pblk->dev->q;
struct pblk_w_ctx w_ctx;
sector_t lba = pblk_get_lba(bio);
unsigned long start_time = jiffies;
unsigned int bpos, pos;
int nr_entries = pblk_get_secs(bio);
int i, ret;
generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
/* Update the write buffer head (mem) with the entries that we can
* write. The write in itself cannot fail, so there is no need to
* rollback from here on.
......@@ -67,6 +71,7 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
pblk_rl_inserted(&pblk->rl, nr_entries);
out:
generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
pblk_write_should_kick(pblk);
return ret;
}
......
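The generic accounting pair makes pblk's buffered writes visible in /proc/diskstats even though they complete in the write cache rather than on the device. The usage pattern, as a sketch of a driver-internal path (names taken from the hunk above):

unsigned long start_time = jiffies;

generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
/* ... copy the bio into the write buffer ... */
generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);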
......@@ -32,8 +32,8 @@ static void pblk_line_mark_bb(struct work_struct *work)
struct pblk_line *line;
int pos;
line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
line = &pblk->lines[pblk_ppa_to_line(*ppa)];
pos = pblk_ppa_to_pos(&dev->geo, *ppa);
pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
line->id, pos);
......@@ -48,7 +48,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int pos = pblk_dev_ppa_to_pos(geo, *ppa);
int pos = pblk_ppa_to_pos(geo, *ppa);
pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
atomic_long_inc(&pblk->erase_failed);
......@@ -66,7 +66,7 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
{
struct pblk_line *line;
line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)];
line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
atomic_dec(&line->left_seblks);
if (rqd->error) {
......@@ -144,7 +144,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
BUG_ON(pblk_ppa_empty(ppa));
#endif
line_id = pblk_tgt_ppa_to_line(ppa);
line_id = pblk_ppa_to_line(ppa);
line = &pblk->lines[line_id];
paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
......@@ -650,7 +650,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
} else {
for (i = 0; i < rqd.nr_ppas; ) {
struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
int pos = pblk_dev_ppa_to_pos(geo, ppa);
int pos = pblk_ppa_to_pos(geo, ppa);
int read_type = PBLK_READ_RANDOM;
if (pblk_io_aligned(pblk, rq_ppas))
......@@ -668,7 +668,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
}
ppa = addr_to_gen_ppa(pblk, paddr, id);
pos = pblk_dev_ppa_to_pos(geo, ppa);
pos = pblk_ppa_to_pos(geo, ppa);
}
if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
......@@ -742,7 +742,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
cmd_op = NVM_OP_PWRITE;
flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
lba_list = emeta_to_lbas(pblk, line->emeta->buf);
} else if (dir == PBLK_READ) {
} else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) {
bio_op = REQ_OP_READ;
cmd_op = NVM_OP_PREAD;
flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
......@@ -802,7 +802,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
if (rqd.error) {
if (dir == PBLK_WRITE)
pblk_log_write_err(pblk, &rqd);
else
else if (dir == PBLK_READ)
pblk_log_read_err(pblk, &rqd);
}
......@@ -816,7 +816,7 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
{
u64 bpaddr = pblk_line_smeta_start(pblk, line);
return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ);
return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV);
}
int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
......@@ -854,8 +854,8 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
struct nvm_geo *geo = &dev->geo;
pr_err("pblk: could not sync erase line:%d,blk:%d\n",
pblk_dev_ppa_to_line(ppa),
pblk_dev_ppa_to_pos(geo, ppa));
pblk_ppa_to_line(ppa),
pblk_ppa_to_pos(geo, ppa));
rqd.error = ret;
goto out;
......@@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
/* Start metadata */
smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns);
smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns);
/* Fill metadata among lines */
if (cur) {
......@@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
lm->sec_per_line);
bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
lm->sec_per_line);
line->sec_in_line -= geo->sec_per_blk;
line->sec_in_line -= geo->sec_per_chk;
if (bit >= lm->emeta_bb)
nr_bb++;
}
......@@ -1145,7 +1145,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
}
spin_unlock(&l_mg->free_lock);
pblk_rl_free_lines_dec(&pblk->rl, line);
pblk_rl_free_lines_dec(&pblk->rl, line, true);
if (!pblk_line_init_bb(pblk, line, 0)) {
list_add(&line->list, &l_mg->free_list);
......@@ -1233,7 +1233,7 @@ static struct pblk_line *pblk_line_retry(struct pblk *pblk,
l_mg->data_line = retry_line;
spin_unlock(&l_mg->free_lock);
pblk_rl_free_lines_dec(&pblk->rl, retry_line);
pblk_rl_free_lines_dec(&pblk->rl, line, false);
if (pblk_line_erase(pblk, retry_line))
goto retry;
......@@ -1252,7 +1252,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line;
int is_next = 0;
spin_lock(&l_mg->free_lock);
line = pblk_line_get(pblk);
......@@ -1280,7 +1279,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
} else {
l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
l_mg->data_next->type = PBLK_LINETYPE_DATA;
is_next = 1;
}
spin_unlock(&l_mg->free_lock);
......@@ -1290,10 +1288,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
return NULL;
}
pblk_rl_free_lines_dec(&pblk->rl, line);
if (is_next)
pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
retry_setup:
if (!pblk_line_init_metadata(pblk, line, NULL)) {
line = pblk_line_retry(pblk, line);
......@@ -1311,6 +1305,8 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
goto retry_setup;
}
pblk_rl_free_lines_dec(&pblk->rl, line, true);
return line;
}
......@@ -1395,7 +1391,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *cur, *new = NULL;
unsigned int left_seblks;
int is_next = 0;
cur = l_mg->data_line;
new = l_mg->data_next;
......@@ -1444,6 +1439,8 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
goto retry_setup;
}
pblk_rl_free_lines_dec(&pblk->rl, new, true);
/* Allocate next line for preparation */
spin_lock(&l_mg->free_lock);
l_mg->data_next = pblk_line_get(pblk);
......@@ -1457,13 +1454,9 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
} else {
l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
l_mg->data_next->type = PBLK_LINETYPE_DATA;
is_next = 1;
}
spin_unlock(&l_mg->free_lock);
if (is_next)
pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
out:
return new;
}
......@@ -1561,8 +1554,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
struct nvm_geo *geo = &dev->geo;
pr_err("pblk: could not async erase line:%d,blk:%d\n",
pblk_dev_ppa_to_line(ppa),
pblk_dev_ppa_to_pos(geo, ppa));
pblk_ppa_to_line(ppa),
pblk_ppa_to_pos(geo, ppa));
}
return err;
......@@ -1746,7 +1739,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_lun *rlun;
int nr_luns = geo->nr_luns;
int nr_luns = geo->all_luns;
int bit = -1;
while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
......@@ -1884,7 +1877,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
/* If the L2P entry maps to a line, the reference is valid */
if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
int line_id = pblk_dev_ppa_to_line(ppa);
int line_id = pblk_ppa_to_line(ppa);
struct pblk_line *line = &pblk->lines[line_id];
kref_get(&line->ref);
......
......@@ -146,7 +146,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
return;
/* Erase blocks that are bad in this line but might not be in next */
if (unlikely(ppa_empty(*erase_ppa)) &&
if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) {
int bit = -1;
......
......@@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch_bio_map(struct bio *bio, void *base);
int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
static inline sector_t bdev_sectors(struct block_device *bdev)
{
......