Commit 2cfa582b authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.14/dm-changes' of...

Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Various DM persistent-data library improvements and fixes that
   benefit both the DM thinp and cache targets.

 - A few small DM kcopyd efficiency improvements.

 - Significant zoned related block core, DM core and DM zoned target
   changes that culminate with adding zoned append emulation (which is
   required to properly fix DM crypt's zoned support).

 - Various DM writecache target changes that improve efficiency. Adds an
   optional "metadata_only" feature that only promotes bios flagged with
   REQ_META. But the most significant improvement is writecache's
   ability to pause writeback, for a confiurable time, if/when the
   working set is larger than the cache (and the cache is full) -- this
   ensures performance is no worse than the slower origin device.

* tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits)
  dm writecache: make writeback pause configurable
  dm writecache: pause writeback if cache full and origin being written directly
  dm io tracker: factor out IO tracker
  dm btree remove: assign new_root only when removal succeeds
  dm zone: fix dm_revalidate_zones() memory allocation
  dm ps io affinity: remove redundant continue statement
  dm writecache: add optional "metadata_only" parameter
  dm writecache: add "cleaner" and "max_age" to Documentation
  dm writecache: write at least 4k when committing
  dm writecache: flush origin device when writing and cache is full
  dm writecache: have ssd writeback wait if the kcopyd workqueue is busy
  dm writecache: use list_move instead of list_del/list_add in writecache_writeback()
  dm writecache: commit just one block, not a full page
  dm writecache: remove unused gfp_t argument from wc_add_block()
  dm crypt: Fix zoned block device support
  dm: introduce zone append emulation
  dm: rearrange core declarations for extended use from dm-zone.c
  block: introduce BIO_ZONE_WRITE_LOCKED bio flag
  block: introduce bio zone helpers
  block: improve handling of all zones reset operation
  ...
parents dbe69e43 5c0de3d7
......@@ -12,7 +12,6 @@ first sector should contain valid superblock from previous invocation.
Constructor parameters:
1. type of the cache device - "p" or "s"
- p - persistent memory
- s - SSD
2. the underlying device that will be cached
......@@ -21,7 +20,6 @@ Constructor parameters:
size)
5. the number of optional parameters (the parameters with an argument
count as two)
start_sector n (default: 0)
offset from the start of cache device in 512-byte sectors
high_watermark n (default: 50)
......@@ -53,6 +51,27 @@ Constructor parameters:
- some underlying devices perform better with fua, some
with nofua. The user should test it
cleaner
when this option is activated (either in the constructor
arguments or by a message), the cache will not promote
new writes (however, writes to already cached blocks are
promoted, to avoid data corruption due to misordered
writes) and it will gradually writeback any cached
data. The userspace can then monitor the cleaning
process with "dmsetup status". When the number of cached
blocks drops to zero, userspace can unload the
dm-writecache target and replace it with dm-linear or
other targets.
max_age n
specifies the maximum age of a block in milliseconds. If
a block is stored in the cache for too long, it will be
written to the underlying device and cleaned up.
metadata_only
only metadata is promoted to the cache. This option
improves performance for heavier REQ_META workloads.
pause_writeback n (default: 3000)
pause writeback if there was some write I/O redirected to
the origin volume in the last n milliseconds
Status:
1. error indicator - 0 if there was no error, otherwise error number
......@@ -77,3 +96,5 @@ Messages:
5. resume the device, so that it will use the linear
target
6. the cache device is now inactive and it can be deleted
cleaner
See above "cleaner" constructor documentation.
......@@ -161,18 +161,89 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
sector_t sector,
sector_t nr_sectors)
static inline unsigned long *blk_alloc_zone_bitmap(int node,
unsigned int nr_zones)
{
if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
return false;
return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
GFP_NOIO, node);
}
static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
/*
* REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
* of the applicable zone range is the entire disk.
* For an all-zones reset, ignore conventional, empty, read-only
* and offline zones.
*/
return !sector && nr_sectors == get_capacity(bdev->bd_disk);
switch (zone->cond) {
case BLK_ZONE_COND_NOT_WP:
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_READONLY:
case BLK_ZONE_COND_OFFLINE:
return 0;
default:
set_bit(idx, (unsigned long *)data);
return 0;
}
}
static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
gfp_t gfp_mask)
{
struct request_queue *q = bdev_get_queue(bdev);
sector_t capacity = get_capacity(bdev->bd_disk);
sector_t zone_sectors = blk_queue_zone_sectors(q);
unsigned long *need_reset;
struct bio *bio = NULL;
sector_t sector = 0;
int ret;
need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones);
if (!need_reset)
return -ENOMEM;
ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0,
q->nr_zones, blk_zone_need_reset_cb,
need_reset);
if (ret < 0)
goto out_free_need_reset;
ret = 0;
while (sector < capacity) {
if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) {
sector += zone_sectors;
continue;
}
bio = blk_next_bio(bio, 0, gfp_mask);
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC;
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
/* This may take a while, so be nice to others */
cond_resched();
}
if (bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
}
out_free_need_reset:
kfree(need_reset);
return ret;
}
static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
{
struct bio bio;
bio_init(&bio, NULL, 0);
bio_set_dev(&bio, bdev);
bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
return submit_bio_wait(&bio);
}
/**
......@@ -200,7 +271,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
sector_t capacity = get_capacity(bdev->bd_disk);
sector_t end_sector = sector + nr_sectors;
struct bio *bio = NULL;
int ret;
int ret = 0;
if (!blk_queue_is_zoned(q))
return -EOPNOTSUPP;
......@@ -222,20 +293,21 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
return -EINVAL;
/*
* In the case of a zone reset operation over all zones,
* REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
* command. For other devices, we emulate this command behavior by
* identifying the zones needing a reset.
*/
if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
if (!blk_queue_zone_resetall(q))
return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
return blkdev_zone_reset_all(bdev, gfp_mask);
}
while (sector < end_sector) {
bio = blk_next_bio(bio, 0, gfp_mask);
bio_set_dev(bio, bdev);
/*
* Special case for the zone reset operation that reset all
* zones, this is useful for applications like mkfs.
*/
if (op == REQ_OP_ZONE_RESET &&
blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
break;
}
bio->bi_opf = op | REQ_SYNC;
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
......@@ -396,13 +468,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
return ret;
}
static inline unsigned long *blk_alloc_zone_bitmap(int node,
unsigned int nr_zones)
{
return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
GFP_NOIO, node);
}
void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
kfree(q->conv_zones_bitmap);
......
......@@ -92,6 +92,10 @@ ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
endif
ifeq ($(CONFIG_BLK_DEV_ZONED),y)
dm-mod-objs += dm-zone.o
endif
ifeq ($(CONFIG_DM_VERITY_FEC),y)
dm-verity-objs += dm-verity-fec.o
endif
......
......@@ -8,6 +8,7 @@
#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include "dm-io-tracker.h"
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
......@@ -39,77 +40,6 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
/*----------------------------------------------------------------*/
struct io_tracker {
spinlock_t lock;
/*
* Sectors of in-flight IO.
*/
sector_t in_flight;
/*
* The time, in jiffies, when this device became idle (if it is
* indeed idle).
*/
unsigned long idle_time;
unsigned long last_update_time;
};
static void iot_init(struct io_tracker *iot)
{
spin_lock_init(&iot->lock);
iot->in_flight = 0ul;
iot->idle_time = 0ul;
iot->last_update_time = jiffies;
}
static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
if (iot->in_flight)
return false;
return time_after(jiffies, iot->idle_time + jifs);
}
static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
bool r;
spin_lock_irq(&iot->lock);
r = __iot_idle_for(iot, jifs);
spin_unlock_irq(&iot->lock);
return r;
}
static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
spin_lock_irq(&iot->lock);
iot->in_flight += len;
spin_unlock_irq(&iot->lock);
}
static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
if (!len)
return;
iot->in_flight -= len;
if (!iot->in_flight)
iot->idle_time = jiffies;
}
static void iot_io_end(struct io_tracker *iot, sector_t len)
{
unsigned long flags;
spin_lock_irqsave(&iot->lock, flags);
__iot_io_end(iot, len);
spin_unlock_irqrestore(&iot->lock, flags);
}
/*----------------------------------------------------------------*/
/*
* Represents a chunk of future work. 'input' allows continuations to pass
* values between themselves, typically error values.
......@@ -470,7 +400,7 @@ struct cache {
struct batcher committer;
struct work_struct commit_ws;
struct io_tracker tracker;
struct dm_io_tracker tracker;
mempool_t migration_pool;
......@@ -866,7 +796,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
if (accountable_bio(cache, bio)) {
pb = get_per_bio_data(bio);
pb->len = bio_sectors(bio);
iot_io_begin(&cache->tracker, pb->len);
dm_iot_io_begin(&cache->tracker, pb->len);
}
}
......@@ -874,7 +804,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio)
{
struct per_bio_data *pb = get_per_bio_data(bio);
iot_io_end(&cache->tracker, pb->len);
dm_iot_io_end(&cache->tracker, pb->len);
}
static void accounted_request(struct cache *cache, struct bio *bio)
......@@ -1642,7 +1572,7 @@ enum busy {
static enum busy spare_migration_bandwidth(struct cache *cache)
{
bool idle = iot_idle_for(&cache->tracker, HZ);
bool idle = dm_iot_idle_for(&cache->tracker, HZ);
sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
cache->sectors_per_block;
......@@ -2603,7 +2533,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
batcher_init(&cache->committer, commit_op, cache,
issue_op, cache, cache->wq);
iot_init(&cache->tracker);
dm_iot_init(&cache->tracker);
init_rwsem(&cache->background_work_lock);
prevent_background_work(cache);
......
......@@ -114,8 +114,27 @@ struct mapped_device {
bool init_tio_pdu:1;
struct srcu_struct io_barrier;
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones;
unsigned int *zwp_offset;
#endif
};
/*
* Bits for the flags field of struct mapped_device.
*/
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7
#define DMF_POST_SUSPENDING 8
#define DMF_EMULATE_ZONE_APPEND 9
void disable_discard(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);
......@@ -130,6 +149,13 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md)
return &md->stats;
}
static inline bool dm_emulate_zone_append(struct mapped_device *md)
{
if (blk_queue_is_zoned(md->queue))
return test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
return false;
}
#define DM_TABLE_MAX_DEPTH 16
struct dm_table {
......@@ -173,6 +199,45 @@ struct dm_table {
#endif
};
/*
* One of these is allocated per clone bio.
*/
#define DM_TIO_MAGIC 7282014
struct dm_target_io {
unsigned int magic;
struct dm_io *io;
struct dm_target *ti;
unsigned int target_bio_nr;
unsigned int *len_ptr;
bool inside_dm_io;
struct bio clone;
};
/*
* One of these is allocated per original bio.
* It contains the first clone used for that original.
*/
#define DM_IO_MAGIC 5191977
struct dm_io {
unsigned int magic;
struct mapped_device *md;
blk_status_t status;
atomic_t io_count;
struct bio *orig_bio;
unsigned long start_time;
spinlock_t endio_lock;
struct dm_stats_aux stats_aux;
/* last member of dm_target_io is 'struct bio' */
struct dm_target_io tio;
};
static inline void dm_io_inc_pending(struct dm_io *io)
{
atomic_inc(&io->io_count);
}
void dm_io_dec_pending(struct dm_io *io, blk_status_t error);
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
......
......@@ -3138,11 +3138,10 @@ static int crypt_report_zones(struct dm_target *ti,
struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct crypt_config *cc = ti->private;
sector_t sector = cc->start + dm_target_offset(ti, args->next_sector);
args->start = cc->start;
return blkdev_report_zones(cc->dev->bdev, sector, nr_zones,
dm_report_zones_cb, args);
return dm_report_zones(cc->dev->bdev, cc->start,
cc->start + dm_target_offset(ti, args->next_sector),
args, nr_zones);
}
#else
#define crypt_report_zones NULL
......@@ -3281,14 +3280,28 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
cc->start = tmpll;
/*
* For zoned block devices, we need to preserve the issuer write
* ordering. To do so, disable write workqueues and force inline
* encryption completion.
*/
if (bdev_is_zoned(cc->dev->bdev)) {
/*
* For zoned block devices, we need to preserve the issuer write
* ordering. To do so, disable write workqueues and force inline
* encryption completion.
*/
set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags);
/*
* All zone append writes to a zone of a zoned block device will
* have the same BIO sector, the start of the zone. When the
* cypher IV mode uses sector values, all data targeting a
* zone will be encrypted using the first sector numbers of the
* zone. This will not result in write errors but will
* cause most reads to fail as reads will use the sector values
* for the actual data locations, resulting in IV mismatch.
* To avoid this problem, ask DM core to emulate zone append
* operations with regular writes.
*/
DMDEBUG("Zone append operations will be emulated");
ti->emulate_zone_append = true;
}
if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
......
......@@ -363,28 +363,32 @@ static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata
core->root = le64_to_cpu(disk->root);
}
static void ws_inc(void *context, const void *value)
static void ws_inc(void *context, const void *value, unsigned count)
{
struct era_metadata *md = context;
struct writeset_disk ws_d;
dm_block_t b;
unsigned i;
memcpy(&ws_d, value, sizeof(ws_d));
b = le64_to_cpu(ws_d.root);
dm_tm_inc(md->tm, b);
for (i = 0; i < count; i++) {
memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
b = le64_to_cpu(ws_d.root);
dm_tm_inc(md->tm, b);
}
}
static void ws_dec(void *context, const void *value)
static void ws_dec(void *context, const void *value, unsigned count)
{
struct era_metadata *md = context;
struct writeset_disk ws_d;
dm_block_t b;
unsigned i;
memcpy(&ws_d, value, sizeof(ws_d));
b = le64_to_cpu(ws_d.root);
dm_bitset_del(&md->bitset_info, b);
for (i = 0; i < count; i++) {
memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
b = le64_to_cpu(ws_d.root);
dm_bitset_del(&md->bitset_info, b);
}
}
static int ws_eq(void *context, const void *value1, const void *value2)
......
......@@ -463,11 +463,10 @@ static int flakey_report_zones(struct dm_target *ti,
struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct flakey_c *fc = ti->private;
sector_t sector = flakey_map_sector(ti, args->next_sector);
args->start = fc->start;
return blkdev_report_zones(fc->dev->bdev, sector, nr_zones,
dm_report_zones_cb, args);
return dm_report_zones(fc->dev->bdev, fc->start,
flakey_map_sector(ti, args->next_sector),
args, nr_zones);
}
#else
#define flakey_report_zones NULL
......
/*
* Copyright (C) 2021 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#ifndef DM_IO_TRACKER_H
#define DM_IO_TRACKER_H
#include <linux/jiffies.h>
struct dm_io_tracker {
spinlock_t lock;
/*
* Sectors of in-flight IO.
*/
sector_t in_flight;
/*
* The time, in jiffies, when this device became idle
* (if it is indeed idle).
*/
unsigned long idle_time;
unsigned long last_update_time;
};
static inline void dm_iot_init(struct dm_io_tracker *iot)
{
spin_lock_init(&iot->lock);
iot->in_flight = 0ul;
iot->idle_time = 0ul;
iot->last_update_time = jiffies;
}
static inline bool dm_iot_idle_for(struct dm_io_tracker *iot, unsigned long j)
{
bool r = false;
spin_lock_irq(&iot->lock);
if (!iot->in_flight)
r = time_after(jiffies, iot->idle_time + j);
spin_unlock_irq(&iot->lock);
return r;
}
static inline unsigned long dm_iot_idle_time(struct dm_io_tracker *iot)
{
unsigned long r = 0;
spin_lock_irq(&iot->lock);
if (!iot->in_flight)
r = jiffies - iot->idle_time;
spin_unlock_irq(&iot->lock);
return r;
}
static inline void dm_iot_io_begin(struct dm_io_tracker *iot, sector_t len)
{
spin_lock_irq(&iot->lock);
iot->in_flight += len;
spin_unlock_irq(&iot->lock);
}
static inline void dm_iot_io_end(struct dm_io_tracker *iot, sector_t len)
{
unsigned long flags;
if (!len)
return;
spin_lock_irqsave(&iot->lock, flags);
iot->in_flight -= len;
if (!iot->in_flight)
iot->idle_time = jiffies;
spin_unlock_irqrestore(&iot->lock, flags);
}
#endif
......@@ -341,7 +341,7 @@ static void client_free_pages(struct dm_kcopyd_client *kc)
struct kcopyd_job {
struct dm_kcopyd_client *kc;
struct list_head list;
unsigned long flags;
unsigned flags;
/*
* Error state of the job.
......@@ -418,7 +418,7 @@ static struct kcopyd_job *pop_io_job(struct list_head *jobs,
* constraint and sequential writes that are at the right position.
*/
list_for_each_entry(job, jobs, list) {
if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
if (job->rw == READ || !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
list_del(&job->list);
return job;
}
......@@ -437,9 +437,8 @@ static struct kcopyd_job *pop(struct list_head *jobs,
struct dm_kcopyd_client *kc)
{
struct kcopyd_job *job = NULL;
unsigned long flags;
spin_lock_irqsave(&kc->job_lock, flags);
spin_lock_irq(&kc->job_lock);
if (!list_empty(jobs)) {
if (jobs == &kc->io_jobs)
......@@ -449,7 +448,7 @@ static struct kcopyd_job *pop(struct list_head *jobs,
list_del(&job->list);
}
}
spin_unlock_irqrestore(&kc->job_lock, flags);
spin_unlock_irq(&kc->job_lock);
return job;
}
......@@ -467,12 +466,11 @@ static void push(struct list_head *jobs, struct kcopyd_job *job)
static void push_head(struct list_head *jobs, struct kcopyd_job *job)
{
unsigned long flags;
struct dm_kcopyd_client *kc = job->kc;
spin_lock_irqsave(&kc->job_lock, flags);
spin_lock_irq(&kc->job_lock);
list_add(&job->list, jobs);
spin_unlock_irqrestore(&kc->job_lock, flags);
spin_unlock_irq(&kc->job_lock);
}
/*
......@@ -525,7 +523,7 @@ static void complete_io(unsigned long error, void *context)
else
job->read_err = 1;
if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) {
push(&kc->complete_jobs, job);
wake(kc);
return;
......@@ -565,7 +563,7 @@ static int run_io_job(struct kcopyd_job *job)
* If we need to write sequentially and some reads or writes failed,
* no point in continuing.
*/
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
job->master_job->write_err) {
job->write_err = job->master_job->write_err;
return -EIO;
......@@ -648,7 +646,6 @@ static void do_work(struct work_struct *work)
struct dm_kcopyd_client *kc = container_of(work,
struct dm_kcopyd_client, kcopyd_work);
struct blk_plug plug;
unsigned long flags;
/*
* The order that these are called is *very* important.
......@@ -657,9 +654,9 @@ static void do_work(struct work_struct *work)
* list. io jobs call wake when they complete and it all
* starts again.
*/
spin_lock_irqsave(&kc->job_lock, flags);
spin_lock_irq(&kc->job_lock);
list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
spin_unlock_irqrestore(&kc->job_lock, flags);
spin_unlock_irq(&kc->job_lock);
blk_start_plug(&plug);
process_jobs(&kc->complete_jobs, kc, run_complete_job);
......@@ -709,7 +706,7 @@ static void segment_complete(int read_err, unsigned long write_err,
* Only dispatch more work if there hasn't been an error.
*/
if ((!job->read_err && !job->write_err) ||
test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) {
/* get the next chunk of work */
progress = job->progress;
count = job->source.count - progress;
......@@ -801,10 +798,10 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
* we need to write sequentially. If one of the destination is a
* host-aware device, then leave it to the caller to choose what to do.
*/
if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
for (i = 0; i < job->num_dests; i++) {
if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags);
job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
break;
}
}
......@@ -813,9 +810,9 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
/*
* If we need to write sequentially, errors cannot be ignored.
*/
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags))
clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags);
if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))
job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR);
if (from) {
job->source = *from;
......@@ -983,3 +980,9 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
kfree(kc);
}
EXPORT_SYMBOL(dm_kcopyd_client_destroy);
void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc)
{
flush_workqueue(kc->kcopyd_wq);
}
EXPORT_SYMBOL(dm_kcopyd_client_flush);
......@@ -140,11 +140,10 @@ static int linear_report_zones(struct dm_target *ti,
struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct linear_c *lc = ti->private;
sector_t sector = linear_map_sector(ti, args->next_sector);
args->start = lc->start;
return blkdev_report_zones(lc->dev->bdev, sector, nr_zones,
dm_report_zones_cb, args);
return dm_report_zones(lc->dev->bdev, lc->start,
linear_map_sector(ti, args->next_sector),
args, nr_zones);
}
#else
#define linear_report_zones NULL
......
......@@ -91,7 +91,6 @@ static int ioa_add_path(struct path_selector *ps, struct dm_path *path,
cpumask_set_cpu(cpu, s->path_mask);
s->path_map[cpu] = pi;
refcount_inc(&pi->refcount);
continue;
}
if (refcount_dec_and_test(&pi->refcount)) {
......
......@@ -364,7 +364,7 @@ static void recover(struct mirror_set *ms, struct dm_region *reg)
/* hand to kcopyd */
if (!errors_handled(ms))
set_bit(DM_KCOPYD_IGNORE_ERROR, &flags);
flags |= BIT(DM_KCOPYD_IGNORE_ERROR);
dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
flags, recovery_complete, reg);
......
......@@ -249,7 +249,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
* If the target is mapped to zoned block device(s), check
* that the zones are not partially mapped.
*/
if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
if (bdev_is_zoned(bdev)) {
unsigned int zone_sectors = bdev_zone_sectors(bdev);
if (start & (zone_sectors - 1)) {
......@@ -1244,7 +1244,7 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
return args.err;
}
static struct blk_ksm_ll_ops dm_ksm_ll_ops = {
static const struct blk_ksm_ll_ops dm_ksm_ll_ops = {
.keyslot_evict = dm_keyslot_evict,
};
......@@ -1981,11 +1981,12 @@ static int device_requires_stable_pages(struct dm_target *ti,
return blk_queue_stable_writes(q);
}
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
bool wc = false, fua = false;
int page_size = PAGE_SIZE;
int r;
/*
* Copy table's limits to the DM device's request_queue
......@@ -2065,19 +2066,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
/*
* For a zoned target, the number of zones should be updated for the
* correct value to be exposed in sysfs queue/nr_zones. For a BIO based
* target, this is all that is needed.
* For a zoned target, setup the zones related queue attributes
* and resources necessary for zone append emulation if necessary.
*/
#ifdef CONFIG_BLK_DEV_ZONED
if (blk_queue_is_zoned(q)) {
WARN_ON_ONCE(queue_is_mq(q));
q->nr_zones = blkdev_nr_zones(t->md->disk);
r = dm_set_zones_restrictions(t, q);
if (r)
return r;
}
#endif
dm_update_keyslot_manager(q, t);
blk_queue_update_readahead(q);
return 0;
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
......
......@@ -311,28 +311,53 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
*t = v & ((1 << 24) - 1);
}
static void data_block_inc(void *context, const void *value_le)
/*
* It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
* possible. 'with_runs' reads contiguous runs of blocks, and calls the
* given sm function.
*/
typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);
static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned count, run_fn fn)
{
struct dm_space_map *sm = context;
__le64 v_le;
uint64_t b;
uint64_t b, begin, end;
uint32_t t;
bool in_run = false;
unsigned i;
memcpy(&v_le, value_le, sizeof(v_le));
unpack_block_time(le64_to_cpu(v_le), &b, &t);
dm_sm_inc_block(sm, b);
for (i = 0; i < count; i++, value_le++) {
/* We know value_le is 8 byte aligned */
unpack_block_time(le64_to_cpu(*value_le), &b, &t);
if (in_run) {
if (b == end) {
end++;
} else {
fn(sm, begin, end);
begin = b;
end = b + 1;
}
} else {
in_run = true;
begin = b;
end = b + 1;
}
}
if (in_run)
fn(sm, begin, end);
}
static void data_block_dec(void *context, const void *value_le)
static void data_block_inc(void *context, const void *value_le, unsigned count)
{
struct dm_space_map *sm = context;
__le64 v_le;
uint64_t b;
uint32_t t;
with_runs((struct dm_space_map *) context,
(const __le64 *) value_le, count, dm_sm_inc_blocks);
}
memcpy(&v_le, value_le, sizeof(v_le));
unpack_block_time(le64_to_cpu(v_le), &b, &t);
dm_sm_dec_block(sm, b);
static void data_block_dec(void *context, const void *value_le, unsigned count)
{
with_runs((struct dm_space_map *) context,
(const __le64 *) value_le, count, dm_sm_dec_blocks);
}
static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
......@@ -349,27 +374,25 @@ static int data_block_equal(void *context, const void *value1_le, const void *va
return b1 == b2;
}
static void subtree_inc(void *context, const void *value)
static void subtree_inc(void *context, const void *value, unsigned count)
{
struct dm_btree_info *info = context;
__le64 root_le;
uint64_t root;
const __le64 *root_le = value;
unsigned i;
memcpy(&root_le, value, sizeof(root_le));
root = le64_to_cpu(root_le);
dm_tm_inc(info->tm, root);
for (i = 0; i < count; i++, root_le++)
dm_tm_inc(info->tm, le64_to_cpu(*root_le));
}
static void subtree_dec(void *context, const void *value)
static void subtree_dec(void *context, const void *value, unsigned count)
{
struct dm_btree_info *info = context;
__le64 root_le;
uint64_t root;
const __le64 *root_le = value;
unsigned i;
memcpy(&root_le, value, sizeof(root_le));
root = le64_to_cpu(root_le);
if (dm_btree_del(info, root))
DMERR("btree delete failed");
for (i = 0; i < count; i++, root_le++)
if (dm_btree_del(info, le64_to_cpu(*root_le)))
DMERR("btree delete failed");
}
static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
......@@ -1761,11 +1784,7 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
int r = 0;
pmd_write_lock(pmd);
for (; b != e; b++) {
r = dm_sm_inc_block(pmd->data_sm, b);
if (r)
break;
}
r = dm_sm_inc_blocks(pmd->data_sm, b, e);
pmd_write_unlock(pmd);
return r;
......@@ -1776,11 +1795,7 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
int r = 0;
pmd_write_lock(pmd);
for (; b != e; b++) {
r = dm_sm_dec_block(pmd->data_sm, b);
if (r)
break;
}
r = dm_sm_dec_blocks(pmd->data_sm, b, e);
pmd_write_unlock(pmd);
return r;
......
This diff is collapsed.
This diff is collapsed.
......@@ -1390,6 +1390,13 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
return -ENXIO;
}
/*
* Devices that have zones with a capacity smaller than the zone size
* (e.g. NVMe zoned namespaces) are not supported.
*/
if (blkz->capacity != blkz->len)
return -ENXIO;
switch (blkz->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
set_bit(DMZ_RND, &zone->flags);
......
......@@ -134,7 +134,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
dst_zone_block = dmz_start_block(zmd, dst_zone);
if (dmz_is_seq(dst_zone))
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
flags |= BIT(DM_KCOPYD_WRITE_SEQ);
while (block < end_block) {
if (src_zone->dev->flags & DMZ_BDEV_DYING)
......
This diff is collapsed.
......@@ -45,6 +45,8 @@ struct dm_dev_internal {
struct dm_table;
struct dm_md_mempools;
struct dm_target_io;
struct dm_io;
/*-----------------------------------------------------------------
* Internal table functions.
......@@ -56,8 +58,8 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
bool dm_table_has_no_data_devices(struct dm_table *table);
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits);
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits);
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits);
struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_presuspend_undo_targets(struct dm_table *t);
......@@ -100,6 +102,30 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
*/
#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t))
/*
* Zoned targets related functions.
*/
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
void dm_cleanup_zoned_dev(struct mapped_device *md);
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
int dm_zone_map_bio(struct dm_target_io *io);
#else
static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {}
#define dm_blk_report_zones NULL
static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
return false;
}
static inline int dm_zone_map_bio(struct dm_target_io *tio)
{
return DM_MAPIO_KILL;
}
#endif
/*-----------------------------------------------------------------
* A registry of target types.
*---------------------------------------------------------------*/
......
......@@ -108,12 +108,10 @@ static void *element_at(struct dm_array_info *info, struct array_block *ab,
* in an array block.
*/
static void on_entries(struct dm_array_info *info, struct array_block *ab,
void (*fn)(void *, const void *))
void (*fn)(void *, const void *, unsigned))
{
unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);
for (i = 0; i < nr_entries; i++)
fn(info->value_type.context, element_at(info, ab, i));
unsigned nr_entries = le32_to_cpu(ab->nr_entries);
fn(info->value_type.context, element_at(info, ab, 0), nr_entries);
}
/*
......@@ -175,19 +173,18 @@ static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
const void *value, unsigned new_nr)
{
unsigned i;
uint32_t nr_entries;
uint32_t nr_entries, delta, i;
struct dm_btree_value_type *vt = &info->value_type;
BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
nr_entries = le32_to_cpu(ab->nr_entries);
for (i = nr_entries; i < new_nr; i++) {
if (vt->inc)
vt->inc(vt->context, value);
delta = new_nr - nr_entries;
if (vt->inc)
vt->inc(vt->context, value, delta);
for (i = nr_entries; i < new_nr; i++)
memcpy(element_at(info, ab, i), value, vt->size);
}
ab->nr_entries = cpu_to_le32(new_nr);
}
......@@ -199,17 +196,16 @@ static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
unsigned new_nr)
{
unsigned i;
uint32_t nr_entries;
uint32_t nr_entries, delta;
struct dm_btree_value_type *vt = &info->value_type;
BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
nr_entries = le32_to_cpu(ab->nr_entries);
for (i = nr_entries; i > new_nr; i--)
if (vt->dec)
vt->dec(vt->context, element_at(info, ab, i - 1));
delta = nr_entries - new_nr;
if (vt->dec)
vt->dec(vt->context, element_at(info, ab, new_nr - 1), delta);
ab->nr_entries = cpu_to_le32(new_nr);
}
......@@ -573,16 +569,17 @@ static int grow(struct resize *resize)
* These are the value_type functions for the btree elements, which point
* to array blocks.
*/
static void block_inc(void *context, const void *value)
static void block_inc(void *context, const void *value, unsigned count)
{
__le64 block_le;
const __le64 *block_le = value;
struct dm_array_info *info = context;
unsigned i;
memcpy(&block_le, value, sizeof(block_le));
dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
for (i = 0; i < count; i++, block_le++)
dm_tm_inc(info->btree_info.tm, le64_to_cpu(*block_le));
}
static void block_dec(void *context, const void *value)
static void __block_dec(void *context, const void *value)
{
int r;
uint64_t b;
......@@ -621,6 +618,13 @@ static void block_dec(void *context, const void *value)
dm_tm_dec(info->btree_info.tm, b);
}
static void block_dec(void *context, const void *value, unsigned count)
{
unsigned i;
for (i = 0; i < count; i++, value += sizeof(__le64))
__block_dec(context, value);
}
static int block_equal(void *context, const void *value1, const void *value2)
{
return !memcmp(value1, value2, sizeof(__le64));
......@@ -711,7 +715,7 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_
return r;
if (vt->inc)
vt->inc(vt->context, element_at(info, ab, i));
vt->inc(vt->context, element_at(info, ab, i), 1);
}
ab->nr_entries = cpu_to_le32(new_nr);
......@@ -822,9 +826,9 @@ static int array_set_value(struct dm_array_info *info, dm_block_t root,
old_value = element_at(info, ab, entry);
if (vt->dec &&
(!vt->equal || !vt->equal(vt->context, old_value, value))) {
vt->dec(vt->context, old_value);
vt->dec(vt->context, old_value, 1);
if (vt->inc)
vt->inc(vt->context, value);
vt->inc(vt->context, value, 1);
}
memcpy(old_value, value, info->value_type.size);
......
......@@ -144,4 +144,17 @@ extern struct dm_block_validator btree_node_validator;
extern void init_le64_type(struct dm_transaction_manager *tm,
struct dm_btree_value_type *vt);
/*
* This returns a shadowed btree leaf that you may modify. In practise
* this means overwrites only, since an insert could cause a node to
* be split. Useful if you need access to the old value to calculate the
* new one.
*
* This only works with single level btrees. The given key must be present in
* the tree, otherwise -EINVAL will be returned.
*/
int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
uint64_t key, int *index,
dm_block_t *new_root, struct dm_block **leaf);
#endif /* DM_BTREE_INTERNAL_H */
......@@ -544,12 +544,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
if (info->value_type.dec)
info->value_type.dec(info->value_type.context,
value_ptr(n, index));
value_ptr(n, index), 1);
delete_at(n, index);
}
*new_root = shadow_root(&spine);
if (!r)
*new_root = shadow_root(&spine);
exit_shadow_spine(&spine);
return r;
......@@ -653,7 +654,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
if (k >= keys[last_level] && k < end_key) {
if (info->value_type.dec)
info->value_type.dec(info->value_type.context,
value_ptr(n, index));
value_ptr(n, index), 1);
delete_at(n, index);
keys[last_level] = k + 1ull;
......
......@@ -236,22 +236,14 @@ dm_block_t shadow_root(struct shadow_spine *s)
return s->root;
}
static void le64_inc(void *context, const void *value_le)
static void le64_inc(void *context, const void *value_le, unsigned count)
{
struct dm_transaction_manager *tm = context;
__le64 v_le;
memcpy(&v_le, value_le, sizeof(v_le));
dm_tm_inc(tm, le64_to_cpu(v_le));
dm_tm_with_runs(context, value_le, count, dm_tm_inc_range);
}
static void le64_dec(void *context, const void *value_le)
static void le64_dec(void *context, const void *value_le, unsigned count)
{
struct dm_transaction_manager *tm = context;
__le64 v_le;
memcpy(&v_le, value_le, sizeof(v_le));
dm_tm_dec(tm, le64_to_cpu(v_le));
dm_tm_with_runs(context, value_le, count, dm_tm_dec_range);
}
static int le64_equal(void *context, const void *value1_le, const void *value2_le)
......
This diff is collapsed.
......@@ -51,21 +51,21 @@ struct dm_btree_value_type {
*/
/*
* The btree is making a duplicate of the value, for instance
* The btree is making a duplicate of a run of values, for instance
* because previously-shared btree nodes have now diverged.
* @value argument is the new copy that the copy function may modify.
* (Probably it just wants to increment a reference count
* somewhere.) This method is _not_ called for insertion of a new
* value: It is assumed the ref count is already 1.
*/
void (*inc)(void *context, const void *value);
void (*inc)(void *context, const void *value, unsigned count);
/*
* This value is being deleted. The btree takes care of freeing
* These values are being deleted. The btree takes care of freeing
* the memory pointed to by @value. Often the del function just
* needs to decrement a reference count somewhere.
* needs to decrement a reference counts somewhere.
*/
void (*dec)(void *context, const void *value);
void (*dec)(void *context, const void *value, unsigned count);
/*
* A test for equality between two values. When a value is
......
......@@ -54,6 +54,20 @@ typedef int (*open_index_fn)(struct ll_disk *ll);
typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
typedef int (*commit_fn)(struct ll_disk *ll);
/*
* A lot of time can be wasted reading and writing the same
* index entry. So we cache a few entries.
*/
#define IE_CACHE_SIZE 64
#define IE_CACHE_MASK (IE_CACHE_SIZE - 1)
struct ie_cache {
bool valid;
bool dirty;
dm_block_t index;
struct disk_index_entry ie;
};
struct ll_disk {
struct dm_transaction_manager *tm;
struct dm_btree_info bitmap_info;
......@@ -79,6 +93,8 @@ struct ll_disk {
max_index_entries_fn max_entries;
commit_fn commit;
bool bitmap_index_changed:1;
struct ie_cache ie_cache[IE_CACHE_SIZE];
};
struct disk_sm_root {
......@@ -96,12 +112,6 @@ struct disk_bitmap_header {
__le64 blocknr;
} __attribute__ ((packed, aligned(8)));
enum allocation_event {
SM_NONE,
SM_ALLOC,
SM_FREE,
};
/*----------------------------------------------------------------*/
int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
......@@ -111,9 +121,15 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
dm_block_t end, dm_block_t *result);
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
dm_block_t begin, dm_block_t end, dm_block_t *result);
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
/*
* The next three functions return (via nr_allocations) the net number of
* allocations that were made. This number may be negative if there were
* more frees than allocs.
*/
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, int32_t *nr_allocations);
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations);
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations);
int sm_ll_commit(struct ll_disk *ll);
int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
......
......@@ -87,76 +87,39 @@ static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
uint32_t count)
{
int r;
uint32_t old_count;
enum allocation_event ev;
int32_t nr_allocations;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
r = sm_ll_insert(&smd->ll, b, count, &ev);
r = sm_ll_insert(&smd->ll, b, count, &nr_allocations);
if (!r) {
switch (ev) {
case SM_NONE:
break;
case SM_ALLOC:
/*
* This _must_ be free in the prior transaction
* otherwise we've lost atomicity.
*/
smd->nr_allocated_this_transaction++;
break;
case SM_FREE:
/*
* It's only free if it's also free in the last
* transaction.
*/
r = sm_ll_lookup(&smd->old_ll, b, &old_count);
if (r)
return r;
if (!old_count)
smd->nr_allocated_this_transaction--;
break;
}
smd->nr_allocated_this_transaction += nr_allocations;
}
return r;
}
static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r;
enum allocation_event ev;
int32_t nr_allocations;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
r = sm_ll_inc(&smd->ll, b, &ev);
if (!r && (ev == SM_ALLOC))
/*
* This _must_ be free in the prior transaction
* otherwise we've lost atomicity.
*/
smd->nr_allocated_this_transaction++;
r = sm_ll_inc(&smd->ll, b, e, &nr_allocations);
if (!r)
smd->nr_allocated_this_transaction += nr_allocations;
return r;
}
static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r;
uint32_t old_count;
enum allocation_event ev;
int32_t nr_allocations;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
r = sm_ll_dec(&smd->ll, b, &ev);
if (!r && (ev == SM_FREE)) {
/*
* It's only free if it's also free in the last
* transaction.
*/
r = sm_ll_lookup(&smd->old_ll, b, &old_count);
if (!r && !old_count)
smd->nr_allocated_this_transaction--;
}
r = sm_ll_dec(&smd->ll, b, e, &nr_allocations);
if (!r)
smd->nr_allocated_this_transaction += nr_allocations;
return r;
}
......@@ -164,21 +127,28 @@ static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
{
int r;
enum allocation_event ev;
int32_t nr_allocations;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
/*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
if (r == -ENOSPC) {
/*
* There's no free block between smd->begin and the end of the metadata device.
* We search before smd->begin in case something has been freed.
*/
r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b);
}
if (r)
return r;
smd->begin = *b + 1;
r = sm_ll_inc(&smd->ll, *b, &ev);
r = sm_ll_inc(&smd->ll, *b, *b + 1, &nr_allocations);
if (!r) {
BUG_ON(ev != SM_ALLOC);
smd->nr_allocated_this_transaction++;
smd->nr_allocated_this_transaction += nr_allocations;
}
return r;
......@@ -194,7 +164,6 @@ static int sm_disk_commit(struct dm_space_map *sm)
return r;
memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
smd->begin = 0;
smd->nr_allocated_this_transaction = 0;
return 0;
......@@ -235,8 +204,8 @@ static struct dm_space_map ops = {
.get_count = sm_disk_get_count,
.count_is_more_than_one = sm_disk_count_is_more_than_one,
.set_count = sm_disk_set_count,
.inc_block = sm_disk_inc_block,
.dec_block = sm_disk_dec_block,
.inc_blocks = sm_disk_inc_blocks,
.dec_blocks = sm_disk_dec_blocks,
.new_block = sm_disk_new_block,
.commit = sm_disk_commit,
.root_size = sm_disk_root_size,
......
......@@ -89,7 +89,8 @@ enum block_op_type {
struct block_op {
enum block_op_type type;
dm_block_t block;
dm_block_t b;
dm_block_t e;
};
struct bop_ring_buffer {
......@@ -116,7 +117,7 @@ static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
}
static int brb_push(struct bop_ring_buffer *brb,
enum block_op_type type, dm_block_t b)
enum block_op_type type, dm_block_t b, dm_block_t e)
{
struct block_op *bop;
unsigned next = brb_next(brb, brb->end);
......@@ -130,7 +131,8 @@ static int brb_push(struct bop_ring_buffer *brb,
bop = brb->bops + brb->end;
bop->type = type;
bop->block = b;
bop->b = b;
bop->e = e;
brb->end = next;
......@@ -145,9 +147,7 @@ static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result)
return -ENODATA;
bop = brb->bops + brb->begin;
result->type = bop->type;
result->block = bop->block;
memcpy(result, bop, sizeof(*result));
return 0;
}
......@@ -178,10 +178,9 @@ struct sm_metadata {
struct threshold threshold;
};
static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b, dm_block_t e)
{
int r = brb_push(&smm->uncommitted, type, b);
int r = brb_push(&smm->uncommitted, type, b, e);
if (r) {
DMERR("too many recursive allocations");
return -ENOMEM;
......@@ -193,15 +192,15 @@ static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t
static int commit_bop(struct sm_metadata *smm, struct block_op *op)
{
int r = 0;
enum allocation_event ev;
int32_t nr_allocations;
switch (op->type) {
case BOP_INC:
r = sm_ll_inc(&smm->ll, op->block, &ev);
r = sm_ll_inc(&smm->ll, op->b, op->e, &nr_allocations);
break;
case BOP_DEC:
r = sm_ll_dec(&smm->ll, op->block, &ev);
r = sm_ll_dec(&smm->ll, op->b, op->e, &nr_allocations);
break;
}
......@@ -314,7 +313,7 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
i = brb_next(&smm->uncommitted, i)) {
struct block_op *op = smm->uncommitted.bops + i;
if (op->block != b)
if (b < op->b || b >= op->e)
continue;
switch (op->type) {
......@@ -355,7 +354,7 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
struct block_op *op = smm->uncommitted.bops + i;
if (op->block != b)
if (b < op->b || b >= op->e)
continue;
switch (op->type) {
......@@ -393,7 +392,7 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
uint32_t count)
{
int r, r2;
enum allocation_event ev;
int32_t nr_allocations;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
if (smm->recursion_count) {
......@@ -402,40 +401,42 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
}
in(smm);
r = sm_ll_insert(&smm->ll, b, count, &ev);
r = sm_ll_insert(&smm->ll, b, count, &nr_allocations);
r2 = out(smm);
return combine_errors(r, r2);
}
static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
static int sm_metadata_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r, r2 = 0;
enum allocation_event ev;
int32_t nr_allocations;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
if (recursing(smm))
r = add_bop(smm, BOP_INC, b);
else {
if (recursing(smm)) {
r = add_bop(smm, BOP_INC, b, e);
if (r)
return r;
} else {
in(smm);
r = sm_ll_inc(&smm->ll, b, &ev);
r = sm_ll_inc(&smm->ll, b, e, &nr_allocations);
r2 = out(smm);
}
return combine_errors(r, r2);
}
static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
static int sm_metadata_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r, r2 = 0;
enum allocation_event ev;
int32_t nr_allocations;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
if (recursing(smm))
r = add_bop(smm, BOP_DEC, b);
r = add_bop(smm, BOP_DEC, b, e);
else {
in(smm);
r = sm_ll_dec(&smm->ll, b, &ev);
r = sm_ll_dec(&smm->ll, b, e, &nr_allocations);
r2 = out(smm);
}
......@@ -445,23 +446,31 @@ static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
{
int r, r2 = 0;
enum allocation_event ev;
int32_t nr_allocations;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
/*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
if (r == -ENOSPC) {
/*
* There's no free block between smm->begin and the end of the metadata device.
* We search before smm->begin in case something has been freed.
*/
r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b);
}
if (r)
return r;
smm->begin = *b + 1;
if (recursing(smm))
r = add_bop(smm, BOP_INC, *b);
r = add_bop(smm, BOP_INC, *b, *b + 1);
else {
in(smm);
r = sm_ll_inc(&smm->ll, *b, &ev);
r = sm_ll_inc(&smm->ll, *b, *b + 1, &nr_allocations);
r2 = out(smm);
}
......@@ -503,7 +512,6 @@ static int sm_metadata_commit(struct dm_space_map *sm)
return r;
memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
smm->begin = 0;
smm->allocated_this_transaction = 0;
return 0;
......@@ -556,8 +564,8 @@ static const struct dm_space_map ops = {
.get_count = sm_metadata_get_count,
.count_is_more_than_one = sm_metadata_count_is_more_than_one,
.set_count = sm_metadata_set_count,
.inc_block = sm_metadata_inc_block,
.dec_block = sm_metadata_dec_block,
.inc_blocks = sm_metadata_inc_blocks,
.dec_blocks = sm_metadata_dec_blocks,
.new_block = sm_metadata_new_block,
.commit = sm_metadata_commit,
.root_size = sm_metadata_root_size,
......@@ -641,18 +649,28 @@ static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
return 0;
}
static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b)
static int sm_bootstrap_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
return add_bop(smm, BOP_INC, b);
r = add_bop(smm, BOP_INC, b, e);
if (r)
return r;
return 0;
}
static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b)
static int sm_bootstrap_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
int r;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
return add_bop(smm, BOP_DEC, b);
r = add_bop(smm, BOP_DEC, b, e);
if (r)
return r;
return 0;
}
static int sm_bootstrap_commit(struct dm_space_map *sm)
......@@ -683,8 +701,8 @@ static const struct dm_space_map bootstrap_ops = {
.get_count = sm_bootstrap_get_count,
.count_is_more_than_one = sm_bootstrap_count_is_more_than_one,
.set_count = sm_bootstrap_set_count,
.inc_block = sm_bootstrap_inc_block,
.dec_block = sm_bootstrap_dec_block,
.inc_blocks = sm_bootstrap_inc_blocks,
.dec_blocks = sm_bootstrap_dec_blocks,
.new_block = sm_bootstrap_new_block,
.commit = sm_bootstrap_commit,
.root_size = sm_bootstrap_root_size,
......@@ -696,7 +714,7 @@ static const struct dm_space_map bootstrap_ops = {
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
{
int r, i;
int r;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
dm_block_t old_len = smm->ll.nr_blocks;
......@@ -718,9 +736,7 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
* allocate any new blocks.
*/
do {
for (i = old_len; !r && i < smm->begin; i++)
r = add_bop(smm, BOP_INC, i);
r = add_bop(smm, BOP_INC, old_len, smm->begin);
if (r)
goto out;
......@@ -767,7 +783,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
dm_block_t superblock)
{
int r;
dm_block_t i;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
smm->begin = superblock + 1;
......@@ -792,9 +807,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
* Now we need to update the newly created data structures with the
* allocated blocks that they were built from.
*/
for (i = superblock; !r && i < smm->begin; i++)
r = add_bop(smm, BOP_INC, i);
r = add_bop(smm, BOP_INC, superblock, smm->begin);
if (r)
return r;
......
......@@ -46,8 +46,8 @@ struct dm_space_map {
int (*commit)(struct dm_space_map *sm);
int (*inc_block)(struct dm_space_map *sm, dm_block_t b);
int (*dec_block)(struct dm_space_map *sm, dm_block_t b);
int (*inc_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e);
int (*dec_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e);
/*
* new_block will increment the returned block.
......@@ -117,14 +117,24 @@ static inline int dm_sm_commit(struct dm_space_map *sm)
return sm->commit(sm);
}
static inline int dm_sm_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
return sm->inc_blocks(sm, b, e);
}
static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b)
{
return sm->inc_block(sm, b);
return dm_sm_inc_blocks(sm, b, b + 1);
}
static inline int dm_sm_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
return sm->dec_blocks(sm, b, e);
}
static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b)
{
return sm->dec_block(sm, b);
return dm_sm_dec_blocks(sm, b, b + 1);
}
static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b)
......
......@@ -359,6 +359,17 @@ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
}
EXPORT_SYMBOL_GPL(dm_tm_inc);
void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e)
{
/*
* The non-blocking clone doesn't support this.
*/
BUG_ON(tm->is_clone);
dm_sm_inc_blocks(tm->sm, b, e);
}
EXPORT_SYMBOL_GPL(dm_tm_inc_range);
void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
{
/*
......@@ -370,6 +381,47 @@ void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
}
EXPORT_SYMBOL_GPL(dm_tm_dec);
void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e)
{
/*
* The non-blocking clone doesn't support this.
*/
BUG_ON(tm->is_clone);
dm_sm_dec_blocks(tm->sm, b, e);
}
EXPORT_SYMBOL_GPL(dm_tm_dec_range);
void dm_tm_with_runs(struct dm_transaction_manager *tm,
const __le64 *value_le, unsigned count, dm_tm_run_fn fn)
{
uint64_t b, begin, end;
bool in_run = false;
unsigned i;
for (i = 0; i < count; i++, value_le++) {
b = le64_to_cpu(*value_le);
if (in_run) {
if (b == end)
end++;
else {
fn(tm, begin, end);
begin = b;
end = b + 1;
}
} else {
in_run = true;
begin = b;
end = b + 1;
}
}
if (in_run)
fn(tm, begin, end);
}
EXPORT_SYMBOL_GPL(dm_tm_with_runs);
int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
uint32_t *result)
{
......@@ -379,6 +431,15 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
return dm_sm_get_count(tm->sm, b, result);
}
int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b,
int *result)
{
if (tm->is_clone)
return -EWOULDBLOCK;
return dm_sm_count_is_more_than_one(tm->sm, b, result);
}
struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
{
return tm->bm;
......
......@@ -100,11 +100,27 @@ void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
* Functions for altering the reference count of a block directly.
*/
void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b);
void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e);
void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b);
void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e);
/*
* Builds up runs of adjacent blocks, and then calls the given fn
* (typically dm_tm_inc/dec). Very useful when you have to perform
* the same tm operation on all values in a btree leaf.
*/
typedef void (*dm_tm_run_fn)(struct dm_transaction_manager *, dm_block_t, dm_block_t);
void dm_tm_with_runs(struct dm_transaction_manager *tm,
const __le64 *value_le, unsigned count, dm_tm_run_fn fn);
int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
uint32_t *result);
int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result);
/*
* Finds out if a given block is shared (ie. has a reference count higher
* than one).
*/
int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b,
int *result);
struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
......
......@@ -300,6 +300,7 @@ enum {
BIO_CGROUP_ACCT, /* has been accounted to a cgroup */
BIO_TRACKED, /* set if bio goes through the rq_qos path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
BIO_FLAG_LAST
};
......
......@@ -1012,6 +1012,18 @@ static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);
static inline unsigned int bio_zone_no(struct bio *bio)
{
return blk_queue_zone_no(bdev_get_queue(bio->bi_bdev),
bio->bi_iter.bi_sector);
}
static inline unsigned int bio_zone_is_seq(struct bio *bio)
{
return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
bio->bi_iter.bi_sector);
}
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
......
......@@ -361,6 +361,12 @@ struct dm_target {
* Set if we need to limit the number of in-flight bios when swapping.
*/
bool limit_swap_bios:1;
/*
* Set if this target implements a a zoned device and needs emulation of
* zone append operations using regular writes.
*/
bool emulate_zone_append:1;
};
void *dm_per_bio_data(struct bio *bio, size_t data_size);
......@@ -478,7 +484,8 @@ struct dm_report_zones_args {
/* must be filled by ->report_zones before calling dm_report_zones_cb */
sector_t start;
};
int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data);
int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
struct dm_report_zones_args *args, unsigned int nr_zones);
#endif /* CONFIG_BLK_DEV_ZONED */
/*
......
......@@ -51,6 +51,7 @@ MODULE_PARM_DESC(name, description)
struct dm_kcopyd_client;
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle);
void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc);
void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc);
/*
* Submit a copy job to kcopyd. This is built on top of the
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment