Commit e9f8ca0a authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.6/dm-changes' of...

Merge tag 'for-5.6/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Fix DM core's potential for q->make_request_fn NULL pointer in the
   unlikely case that a DM device is created without a DM table and then
   accessed due to upper-layer userspace code or user error.

 - Fix DM thin-provisioning's metadata_pre_commit_callback to not use
   memory after it is free'd. Also refactor code to disallow changing
   the thin-pool's data device once in use -- doing so guarantees smae
   lifetime of pool's data device relative to the pool metadata.

 - Fix DM space maps used by DM thinp and DM cache to avoid reuse of a
   already used block. This race was identified with extremely heavy
   snapshot use in the context of DM thin provisioning.

 - Fix DM raid's table status relative to an active rebuild.

 - Fix DM crypt to use GFP_NOIO rather than GFP_NOFS in call to
   skcipher_request_alloc(). Also fix benbi IV constructor crash if used
   in authenticated mode.

 - Add DM crypt support for Elephant diffuser to allow for Bitlocker
   compatibility.

 - Fix DM verity target to not prefetch hash blocks for data that has
   already been verified.

 - Fix DM writecache's incorrect flush sequence during commit when in
   SSD mode.

 - Improve DM writecache's sequential write performance on SSDs.

 - Add DM zoned target support for zone sizes smaller than 128MiB.

 - Add DM multipath 'queue_if_no_path_timeout_secs' module param to
   allow timeout if path isn't reinstated. This allows users a kernel
   safety-net against IO hanging indefinitely, due to no active paths,
   that has historically only been provided by multipathd userspace.

 - Various DM code cleanups to use true/false rather than 1/0, a
   variable rename in dm-dust, and fix for a math error in comment for
   DM thin metadata's ondisk format.

* tag 'for-5.6/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (21 commits)
  dm: fix potential for q->make_request_fn NULL pointer
  dm writecache: improve performance of large linear writes on SSDs
  dm mpath: Add timeout mechanism for queue_if_no_path
  dm thin: change data device's flush_bio to be member of struct pool
  dm thin: don't allow changing data device during thin-pool reload
  dm thin: fix use-after-free in metadata_pre_commit_callback
  dm thin metadata: use pool locking at end of dm_pool_metadata_close
  dm writecache: fix incorrect flush sequence when doing SSD mode commit
  dm crypt: fix benbi IV constructor crash if used in authenticated mode
  dm crypt: Implement Elephant diffuser for Bitlocker compatibility
  dm space map common: fix to ensure new block isn't already in use
  dm verity: don't prefetch hash blocks for already-verified data
  dm crypt: fix GFP flags passed to skcipher_request_alloc()
  dm thin metadata: Fix trivial math error in on-disk format documentation
  dm thin metadata: use true/false for bool variable
  dm snapshot: use true/false for bool variable
  dm bio prison v2: use true/false for bool variable
  dm mpath: use true/false for bool variable
  dm zoned: support zone sizes smaller than 128MiB
  dm raid: table line rebuild status fixes
  ...
parents 05ef8b97 47ace7e0
......@@ -419,3 +419,5 @@ Version History
rebuild errors.
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
pages allocated; also fix those not occuring after previous reductions
1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode)
on the status line.
......@@ -324,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison,
bio_list_init(&cell->bios);
if (cell->shared_count) {
cell->exclusive_lock = 0;
cell->exclusive_lock = false;
return false;
}
......
This diff is collapsed.
......@@ -207,16 +207,16 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
int ret = DM_MAPIO_REMAPPED;
int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
ret = __dust_map_write(dd, thisblock);
r = __dust_map_write(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
return ret;
return r;
}
static int dust_map(struct dm_target *ti, struct bio *bio)
......
......@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
......@@ -29,6 +30,9 @@
#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
/* Path properties */
struct pgpath {
......@@ -91,6 +95,8 @@ struct multipath {
struct work_struct process_queued_bios;
struct bio_list queued_bios;
struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
};
/*
......@@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
static void queue_if_no_path_timeout_work(struct timer_list *t);
/*-----------------------------------------------
* Multipath state flags.
......@@ -195,6 +202,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->ti = ti;
ti->private = m;
timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
}
return m;
......@@ -717,6 +726,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
return 0;
}
/*
* If the queue_if_no_path timeout fires, turn off queue_if_no_path and
* process any queued I/O.
*/
static void queue_if_no_path_timeout_work(struct timer_list *t)
{
struct multipath *m = from_timer(m, t, nopath_timer);
struct mapped_device *md = dm_table_get_md(m->ti->table);
DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md));
queue_if_no_path(m, false, false);
}
/*
* Enable the queue_if_no_path timeout if necessary.
* Called with m->lock held.
*/
static void enable_nopath_timeout(struct multipath *m)
{
unsigned long queue_if_no_path_timeout =
READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
lockdep_assert_held(&m->lock);
if (queue_if_no_path_timeout > 0 &&
atomic_read(&m->nr_valid_paths) == 0 &&
test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
mod_timer(&m->nopath_timer,
jiffies + queue_if_no_path_timeout);
}
}
static void disable_nopath_timeout(struct multipath *m)
{
del_timer_sync(&m->nopath_timer);
}
/*
* An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass.
......@@ -1090,6 +1136,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
unsigned long flags;
as.argc = argc;
as.argv = argv;
......@@ -1154,6 +1201,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
spin_lock_irqsave(&m->lock, flags);
enable_nopath_timeout(m);
spin_unlock_irqrestore(&m->lock, flags);
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
......@@ -1208,6 +1259,7 @@ static void multipath_dtr(struct dm_target *ti)
{
struct multipath *m = ti->private;
disable_nopath_timeout(m);
flush_multipath_work(m);
free_multipath(m);
}
......@@ -1241,6 +1293,8 @@ static int fail_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
enable_nopath_timeout(m);
out:
spin_unlock_irqrestore(&m->lock, flags);
......@@ -1291,6 +1345,9 @@ static int reinstate_path(struct pgpath *pgpath)
process_queued_io_list(m);
}
if (pgpath->is_active)
disable_nopath_timeout(m);
return r;
}
......@@ -1444,7 +1501,7 @@ static void pg_init_done(void *data, int errors)
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
delay_retry = 1;
delay_retry = true;
/* fall through */
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
......@@ -1789,6 +1846,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
struct dm_dev *dev;
struct multipath *m = ti->private;
action_fn action;
unsigned long flags;
mutex_lock(&m->work_mutex);
......@@ -1800,9 +1858,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, true, false);
spin_lock_irqsave(&m->lock, flags);
enable_nopath_timeout(m);
spin_unlock_irqrestore(&m->lock, flags);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, false, false);
disable_nopath_timeout(m);
goto out;
}
}
......@@ -2065,6 +2127,10 @@ static void __exit dm_multipath_exit(void)
module_init(dm_multipath_init);
module_exit(dm_multipath_exit);
module_param_named(queue_if_no_path_timeout_secs,
queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
......@@ -129,7 +129,9 @@ struct raid_dev {
CTR_FLAG_RAID10_COPIES | \
CTR_FLAG_RAID10_FORMAT | \
CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET)
CTR_FLAG_DATA_OFFSET | \
CTR_FLAG_JOURNAL_DEV | \
CTR_FLAG_JOURNAL_MODE)
/* Valid options definitions per raid level... */
......@@ -3001,7 +3003,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{ 1, 254, "Cannot understand number of raid devices parameters" }
};
/* Must have <raid_type> */
arg = dm_shift_arg(&as);
if (!arg) {
ti->error = "No arguments";
......@@ -3508,8 +3509,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
unsigned long recovery;
unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
unsigned int sz = 0;
unsigned int rebuild_disks;
unsigned int write_mostly_params = 0;
unsigned int rebuild_writemostly_count = 0;
sector_t progress, resync_max_sectors, resync_mismatches;
enum sync_state state;
struct raid_type *rt;
......@@ -3593,18 +3593,20 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE:
/* Report the table line string you would use to construct this raid set */
/* Calculate raid parameter count */
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
write_mostly_params += 2;
rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks));
raid_param_cnt += rebuild_disks * 2 +
write_mostly_params +
/*
* Count any rebuild or writemostly argument pairs and subtract the
* hweight count being added below of any rebuild and writemostly ctr flags.
*/
for (i = 0; i < rs->raid_disks; i++) {
rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) +
(test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0);
}
rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) +
(test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0);
/* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. */
raid_param_cnt += rebuild_writemostly_count +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
(test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
(test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
/* Emit table line */
/* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
......@@ -3612,11 +3614,10 @@ static void raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (rebuild_disks)
if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
rs->dev[i].rdev.raid_disk);
if (test_bit(i, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i);
if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
mddev->bitmap_info.daemon_sleep);
......@@ -3626,7 +3627,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
mddev->sync_speed_max);
if (write_mostly_params)
if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
......@@ -4029,7 +4030,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
.version = {1, 15, 0},
.version = {1, 15, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
......
......@@ -1061,7 +1061,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
DMERR("Read error in exception store: "
"shutting down merge");
down_write(&s->lock);
s->merge_failed = 1;
s->merge_failed = true;
up_write(&s->lock);
}
goto shut;
......@@ -1149,7 +1149,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
shut:
down_write(&s->lock);
s->merge_failed = 1;
s->merge_failed = true;
b = __release_queued_bios_after_merge(s);
up_write(&s->lock);
error_bios(b);
......@@ -1314,7 +1314,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
s->merge_failed = 0;
s->merge_failed = false;
s->first_merging_chunk = 0;
s->num_merging_chunks = 0;
bio_list_init(&s->bios_queued_during_merge);
......
......@@ -28,7 +28,7 @@
*
* - A hierarchical btree, with 2 levels which effectively maps (thin
* dev id, virtual block) -> block_time. Block time is a 64-bit
* field holding the time in the low 24 bits, and block in the top 48
* field holding the time in the low 24 bits, and block in the top 40
* bits.
*
* BTrees consist solely of btree_nodes, that fill a block. Some are
......@@ -387,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
* Variant that is used for in-core only changes or code that
* shouldn't put the pool in service on its own (e.g. commit).
*/
static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
__acquires(pmd->root_lock)
{
down_write(&pmd->root_lock);
}
#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
__pmd_write_lock(pmd);
pmd_write_lock_in_core(pmd);
if (unlikely(!pmd->in_service))
pmd->in_service = true;
}
......@@ -811,7 +810,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
return r;
if (td->open_count)
td->changed = 0;
td->changed = false;
else {
list_del(&td->list);
kfree(td);
......@@ -831,6 +830,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
* We need to know if the thin_disk_superblock exceeds a 512-byte sector.
*/
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
BUG_ON(!rwsem_is_locked(&pmd->root_lock));
if (unlikely(!pmd->in_service))
return 0;
......@@ -953,6 +953,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return -EBUSY;
}
pmd_write_lock_in_core(pmd);
if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
r = __commit_transaction(pmd);
if (r < 0)
......@@ -961,6 +962,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
}
if (!pmd->fail_io)
__destroy_persistent_data_objects(pmd);
pmd_write_unlock(pmd);
kfree(pmd);
return 0;
......@@ -1106,7 +1108,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd,
if (r)
return r;
td->changed = 1;
td->changed = true;
td->snapshotted_time = time;
snap->mapped_blocks = td->mapped_blocks;
......@@ -1618,7 +1620,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
if (r)
return r;
td->changed = 1;
td->changed = true;
if (inserted)
td->mapped_blocks++;
......@@ -1649,7 +1651,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
return r;
td->mapped_blocks--;
td->changed = 1;
td->changed = true;
return 0;
}
......@@ -1703,7 +1705,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
}
td->mapped_blocks -= total_count;
td->changed = 1;
td->changed = true;
/*
* Reinsert the mapping tree.
......@@ -1841,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
* Care is taken to not have commit be what
* triggers putting the thin-pool in-service.
*/
__pmd_write_lock(pmd);
pmd_write_lock_in_core(pmd);
if (pmd->fail_io)
goto out;
......
......@@ -231,6 +231,7 @@ struct pool {
struct dm_target *ti; /* Only set if a pool target is bound */
struct mapped_device *pool_md;
struct block_device *data_dev;
struct block_device *md_dev;
struct dm_pool_metadata *pmd;
......@@ -281,6 +282,8 @@ struct pool {
struct dm_bio_prison_cell **cell_sort_array;
mempool_t mapping_pool;
struct bio flush_bio;
};
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
......@@ -328,7 +331,6 @@ struct pool_c {
dm_block_t low_water_blocks;
struct pool_features requested_pf; /* Features requested during table load */
struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
struct bio flush_bio;
};
/*
......@@ -2924,6 +2926,7 @@ static void __pool_destroy(struct pool *pool)
if (pool->next_mapping)
mempool_free(pool->next_mapping, &pool->mapping_pool);
mempool_exit(&pool->mapping_pool);
bio_uninit(&pool->flush_bio);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
......@@ -2933,6 +2936,7 @@ static struct kmem_cache *_new_mapping_cache;
static struct pool *pool_create(struct mapped_device *pool_md,
struct block_device *metadata_dev,
struct block_device *data_dev,
unsigned long block_size,
int read_only, char **error)
{
......@@ -3003,6 +3007,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = false;
pool->suspended = true;
pool->out_of_data_space = false;
bio_init(&pool->flush_bio, NULL, 0);
pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) {
......@@ -3040,6 +3045,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
pool->data_dev = data_dev;
__pool_table_insert(pool);
return pool;
......@@ -3081,6 +3087,7 @@ static void __pool_dec(struct pool *pool)
static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev,
struct block_device *data_dev,
unsigned long block_size, int read_only,
char **error, int *created)
{
......@@ -3091,19 +3098,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
*error = "metadata device already in use by a pool";
return ERR_PTR(-EBUSY);
}
if (pool->data_dev != data_dev) {
*error = "data device already in use by a pool";
return ERR_PTR(-EBUSY);
}
__pool_inc(pool);
} else {
pool = __pool_table_lookup(pool_md);
if (pool) {
if (pool->md_dev != metadata_dev) {
if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
*error = "different pool cannot replace a pool";
return ERR_PTR(-EINVAL);
}
__pool_inc(pool);
} else {
pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
*created = 1;
}
}
......@@ -3124,7 +3135,6 @@ static void pool_dtr(struct dm_target *ti)
__pool_dec(pt->pool);
dm_put_device(ti, pt->metadata_dev);
dm_put_device(ti, pt->data_dev);
bio_uninit(&pt->flush_bio);
kfree(pt);
mutex_unlock(&dm_thin_pool_table.mutex);
......@@ -3203,11 +3213,11 @@ static void metadata_low_callback(void *context)
*/
static int metadata_pre_commit_callback(void *context)
{
struct pool_c *pt = context;
struct bio *flush_bio = &pt->flush_bio;
struct pool *pool = context;
struct bio *flush_bio = &pool->flush_bio;
bio_reset(flush_bio);
bio_set_dev(flush_bio, pt->data_dev->bdev);
bio_set_dev(flush_bio, pool->data_dev);
flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
return submit_bio_wait(flush_bio);
......@@ -3356,7 +3366,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out;
}
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
if (IS_ERR(pool)) {
r = PTR_ERR(pool);
......@@ -3381,7 +3391,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf;
bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1;
/*
......@@ -3408,9 +3417,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto out_flags_changed;
dm_pool_register_pre_commit_callback(pt->pool->pmd,
metadata_pre_commit_callback,
pt);
dm_pool_register_pre_commit_callback(pool->pmd,
metadata_pre_commit_callback, pool);
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
......@@ -4099,7 +4107,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
.version = {1, 21, 0},
.version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
......@@ -4476,7 +4484,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
.version = {1, 21, 0},
.version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
......
......@@ -611,8 +611,22 @@ static void verity_prefetch_io(struct work_struct *work)
static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
{
sector_t block = io->block;
unsigned int n_blocks = io->n_blocks;
struct dm_verity_prefetch_work *pw;
if (v->validated_blocks) {
while (n_blocks && test_bit(block, v->validated_blocks)) {
block++;
n_blocks--;
}
while (n_blocks && test_bit(block + n_blocks - 1,
v->validated_blocks))
n_blocks--;
if (!n_blocks)
return;
}
pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
......@@ -621,8 +635,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
INIT_WORK(&pw->work, verity_prefetch_io);
pw->v = v;
pw->block = io->block;
pw->n_blocks = io->n_blocks;
pw->block = block;
pw->n_blocks = n_blocks;
queue_work(v->verify_wq, &pw->work);
}
......
......@@ -442,7 +442,13 @@ static void writecache_notify_io(unsigned long error, void *context)
complete(&endio->c);
}
static void ssd_commit_flushed(struct dm_writecache *wc)
static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
wait_event(wc->bio_in_progress_wait[direction],
!atomic_read(&wc->bio_in_progress[direction]));
}
static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
struct dm_io_region region;
struct dm_io_request req;
......@@ -488,17 +494,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
writecache_notify_io(0, &endio);
wait_for_completion_io(&endio.c);
if (wait_for_ios)
writecache_wait_for_ios(wc, WRITE);
writecache_disk_flush(wc, wc->ssd_dev);
memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
static void writecache_commit_flushed(struct dm_writecache *wc)
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
if (WC_MODE_PMEM(wc))
wmb();
else
ssd_commit_flushed(wc);
ssd_commit_flushed(wc, wait_for_ios);
}
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
......@@ -522,12 +531,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
writecache_error(wc, r, "error flushing metadata: %d", r);
}
static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
wait_event(wc->bio_in_progress_wait[direction],
!atomic_read(&wc->bio_in_progress[direction]));
}
#define WFE_RETURN_FOLLOWING 1
#define WFE_LOWEST_SEQ 2
......@@ -622,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry
wc->freelist_size++;
}
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
struct wc_entry *e;
......@@ -631,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(!wc->current_free))
return NULL;
e = wc->current_free;
if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
return NULL;
next = rb_next(&e->rb_node);
rb_erase(&e->rb_node, &wc->freetree);
if (unlikely(!next))
......@@ -640,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(list_empty(&wc->freelist)))
return NULL;
e = container_of(wc->freelist.next, struct wc_entry, lru);
if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
return NULL;
list_del(&e->lru);
}
wc->freelist_size--;
......@@ -724,15 +731,12 @@ static void writecache_flush(struct dm_writecache *wc)
e = e2;
cond_resched();
}
writecache_commit_flushed(wc);
if (!WC_MODE_PMEM(wc))
writecache_wait_for_ios(wc, WRITE);
writecache_commit_flushed(wc, true);
wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
wc->overwrote_committed = false;
......@@ -756,7 +760,7 @@ static void writecache_flush(struct dm_writecache *wc)
}
if (need_flush_after_free)
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
}
static void writecache_flush_work(struct work_struct *work)
......@@ -809,7 +813,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
}
if (discarded_something)
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
}
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
......@@ -958,7 +962,7 @@ static void writecache_resume(struct dm_target *ti)
if (need_flush) {
writecache_flush_all_metadata(wc);
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
}
wc_unlock(wc);
......@@ -1193,7 +1197,7 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
goto bio_copy;
}
}
e = writecache_pop_from_freelist(wc);
e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) {
writecache_wait_on_freelist(wc);
continue;
......@@ -1205,9 +1209,26 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
if (WC_MODE_PMEM(wc)) {
bio_copy_block(wc, bio, memory_data(wc, e));
} else {
dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
unsigned bio_size = wc->block_size;
sector_t start_cache_sec = cache_sector(wc, e);
sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
while (bio_size < bio->bi_iter.bi_size) {
struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
if (!f)
break;
write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
(bio_size >> SECTOR_SHIFT), wc->seq_count);
writecache_insert_entry(wc, f);
wc->uncommitted_blocks++;
bio_size += wc->block_size;
current_cache_sec += wc->block_size >> SECTOR_SHIFT;
}
bio_set_dev(bio, wc->ssd_dev->bdev);
bio->bi_iter.bi_sector = cache_sector(wc, e);
bio->bi_iter.bi_sector = start_cache_sec;
dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
wc->uncommitted_blocks = 0;
queue_work(wc->writeback_wq, &wc->flush_work);
......@@ -1342,7 +1363,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *
wc->writeback_size--;
n_walked++;
if (unlikely(n_walked >= ENDIO_LATENCY)) {
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
wc_unlock(wc);
wc_lock(wc);
n_walked = 0;
......@@ -1423,7 +1444,7 @@ static int writecache_endio_thread(void *data)
writecache_wait_for_ios(wc, READ);
}
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
wc_unlock(wc);
}
......@@ -1766,10 +1787,10 @@ static int init_memory(struct dm_writecache *wc)
write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
writecache_flush_all_metadata(wc);
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
return 0;
}
......
......@@ -134,6 +134,7 @@ struct dmz_metadata {
sector_t zone_bitmap_size;
unsigned int zone_nr_bitmap_blocks;
unsigned int zone_bits_per_mblk;
unsigned int nr_bitmap_blocks;
unsigned int nr_map_blocks;
......@@ -1161,7 +1162,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
/* Init */
zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
zmd->zone_nr_bitmap_blocks =
max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
DMZ_BLOCK_SIZE_BITS);
/* Allocate zone array */
zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
......@@ -1956,7 +1960,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
dmz_release_mblock(zmd, to_mblk);
dmz_release_mblock(zmd, from_mblk);
chunk_block += DMZ_BLOCK_SIZE_BITS;
chunk_block += zmd->zone_bits_per_mblk;
}
to_zone->weight = from_zone->weight;
......@@ -2017,7 +2021,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Set bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
if (count) {
......@@ -2096,7 +2100,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Clear bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_clear_bits((unsigned long *)mblk->data,
bit, nr_bits);
......@@ -2156,6 +2160,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
{
struct dmz_mblock *mblk;
unsigned int bit, set_bit, nr_bits;
unsigned int zone_bits = zmd->zone_bits_per_mblk;
unsigned long *bitmap;
int n = 0;
......@@ -2170,15 +2175,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Get offset */
bitmap = (unsigned long *) mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
nr_bits = min(nr_blocks, zone_bits - bit);
if (set)
set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
set_bit = find_next_bit(bitmap, zone_bits, bit);
else
set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
dmz_release_mblock(zmd, mblk);
n += set_bit - bit;
if (set_bit < DMZ_BLOCK_SIZE_BITS)
if (set_bit < zone_bits)
break;
nr_blocks -= nr_bits;
......@@ -2281,7 +2286,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Count bits in this block */
bitmap = mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
n += dmz_count_bits(bitmap, bit, nr_bits);
dmz_release_mblock(zmd, mblk);
......
......@@ -1859,6 +1859,7 @@ static void dm_init_normal_md_queue(struct mapped_device *md)
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
md->queue->backing_dev_info->congested_data = md;
md->queue->backing_dev_info->congested_fn = dm_any_congested;
}
......@@ -1949,7 +1950,12 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->queue)
goto bad;
md->queue->queuedata = md;
md->queue->backing_dev_info->congested_data = md;
/*
* default to bio-based required ->make_request_fn until DM
* table is loaded and md->type established. If request-based
* table is loaded: blk-mq will override accordingly.
*/
blk_queue_make_request(md->queue, dm_make_request);
md->disk = alloc_disk_node(1, md->numa_node_id);
if (!md->disk)
......@@ -2264,7 +2270,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
case DM_TYPE_DAX_BIO_BASED:
case DM_TYPE_NVME_BIO_BASED:
dm_init_normal_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
......
......@@ -380,6 +380,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
return -ENOSPC;
}
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
dm_block_t begin, dm_block_t end, dm_block_t *b)
{
int r;
uint32_t count;
do {
r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b);
if (r)
break;
/* double check this block wasn't used in the old transaction */
if (*b >= old_ll->nr_blocks)
count = 0;
else {
r = sm_ll_lookup(old_ll, *b, &count);
if (r)
break;
if (count)
begin = *b + 1;
}
} while (count);
return r;
}
static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
int (*mutator)(void *context, uint32_t old, uint32_t *new),
void *context, enum allocation_event *ev)
......
......@@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
dm_block_t end, dm_block_t *result);
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
dm_block_t begin, dm_block_t end, dm_block_t *result);
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
......
......@@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
/* FIXME: we should loop round a couple of times */
r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
/*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
if (r)
return r;
......
......@@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
/*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
if (r)
return r;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment