Commit e9f8ca0a authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.6/dm-changes' of...

Merge tag 'for-5.6/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Fix DM core's potential for q->make_request_fn NULL pointer in the
   unlikely case that a DM device is created without a DM table and then
   accessed due to upper-layer userspace code or user error.

 - Fix DM thin-provisioning's metadata_pre_commit_callback to not use
   memory after it is free'd. Also refactor code to disallow changing
   the thin-pool's data device once in use -- doing so guarantees same
   lifetime of pool's data device relative to the pool metadata.

 - Fix DM space maps used by DM thinp and DM cache to avoid reuse of an
   already used block. This race was identified with extremely heavy
   snapshot use in the context of DM thin provisioning.

 - Fix DM raid's table status relative to an active rebuild.

 - Fix DM crypt to use GFP_NOIO rather than GFP_NOFS in call to
   skcipher_request_alloc(). Also fix benbi IV constructor crash if used
   in authenticated mode.

 - Add DM crypt support for Elephant diffuser to allow for Bitlocker
   compatibility.

 - Fix DM verity target to not prefetch hash blocks for data that has
   already been verified.

 - Fix DM writecache's incorrect flush sequence during commit when in
   SSD mode.

 - Improve DM writecache's sequential write performance on SSDs.

 - Add DM zoned target support for zone sizes smaller than 128MiB.

 - Add DM multipath 'queue_if_no_path_timeout_secs' module param to
   allow timeout if path isn't reinstated. This allows users a kernel
   safety-net against IO hanging indefinitely, due to no active paths,
   that has historically only been provided by multipathd userspace.

 - Various DM code cleanups to use true/false rather than 1/0, a
   variable rename in dm-dust, and fix for a math error in comment for
   DM thin metadata's ondisk format.

* tag 'for-5.6/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (21 commits)
  dm: fix potential for q->make_request_fn NULL pointer
  dm writecache: improve performance of large linear writes on SSDs
  dm mpath: Add timeout mechanism for queue_if_no_path
  dm thin: change data device's flush_bio to be member of struct pool
  dm thin: don't allow changing data device during thin-pool reload
  dm thin: fix use-after-free in metadata_pre_commit_callback
  dm thin metadata: use pool locking at end of dm_pool_metadata_close
  dm writecache: fix incorrect flush sequence when doing SSD mode commit
  dm crypt: fix benbi IV constructor crash if used in authenticated mode
  dm crypt: Implement Elephant diffuser for Bitlocker compatibility
  dm space map common: fix to ensure new block isn't already in use
  dm verity: don't prefetch hash blocks for already-verified data
  dm crypt: fix GFP flags passed to skcipher_request_alloc()
  dm thin metadata: Fix trivial math error in on-disk format documentation
  dm thin metadata: use true/false for bool variable
  dm snapshot: use true/false for bool variable
  dm bio prison v2: use true/false for bool variable
  dm mpath: use true/false for bool variable
  dm zoned: support zone sizes smaller than 128MiB
  dm raid: table line rebuild status fixes
  ...
parents 05ef8b97 47ace7e0
......@@ -419,3 +419,5 @@ Version History
rebuild errors.
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
pages allocated; also fix those not occurring after previous reductions
1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode)
on the status line.
......@@ -324,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison,
bio_list_init(&cell->bios);
if (cell->shared_count) {
cell->exclusive_lock = 0;
cell->exclusive_lock = false;
return false;
}
......
/*
* Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
* Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved.
* Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com>
* Copyright (C) 2006-2020 Red Hat, Inc. All rights reserved.
* Copyright (C) 2013-2020 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
*/
......@@ -115,6 +115,11 @@ struct iv_tcw_private {
u8 *whitening;
};
/*
 * Largest accepted size, in bytes, of the extra (second) key part used by
 * the elephant IV mode; it also sizes the on-stack zero key used when wiping.
 */
#define ELEPHANT_MAX_KEY_SIZE 32
/* Private state of the elephant IV mode. */
struct iv_elephant_private {
struct crypto_skcipher *tfm; /* cipher handle used to derive the per-sector key */
};
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
......@@ -125,6 +130,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
/* Bit numbers tested and set in cc->cipher_flags via test_bit()/set_bit(). */
enum cipher_flags {
CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */
CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */
};
/*
......@@ -152,6 +158,7 @@ struct crypt_config {
struct iv_benbi_private benbi;
struct iv_lmk_private lmk;
struct iv_tcw_private tcw;
struct iv_elephant_private elephant;
} iv_gen_private;
u64 iv_offset;
unsigned int iv_size;
......@@ -285,6 +292,11 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
* eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode)
* The IV is encrypted little-endian byte-offset (with the same key
* and cipher as the volume).
*
* elephant: The extended version of eboiv with additional Elephant diffuser
* used with Bitlocker CBC mode.
* This mode was used in older Windows systems
* http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf
*/
static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
......@@ -331,8 +343,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
unsigned bs = crypto_skcipher_blocksize(any_tfm(cc));
int log = ilog2(bs);
unsigned bs;
int log;
if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
bs = crypto_aead_blocksize(any_tfm_aead(cc));
else
bs = crypto_skcipher_blocksize(any_tfm(cc));
log = ilog2(bs);
/* we need to calculate how far we must shift the sector count
* to get the cipher block count, we use this shift in _gen */
......@@ -717,7 +735,7 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
struct crypto_wait wait;
int err;
req = skcipher_request_alloc(any_tfm(cc), GFP_KERNEL | GFP_NOFS);
req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO);
if (!req)
return -ENOMEM;
......@@ -734,6 +752,290 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
return err;
}
/*
 * Tear down the elephant IV state: release the cipher handle and clear the
 * stored pointer so that no stale handle is left behind.
 */
static void crypt_iv_elephant_dtr(struct crypt_config *cc)
{
	struct crypto_skcipher **tfm_slot = &cc->iv_gen_private.elephant.tfm;

	crypto_free_skcipher(*tfm_slot);
	*tfm_slot = NULL;
}
/*
 * Set up the elephant IV mode: allocate the "ecb(aes)" cipher handle kept in
 * the private state, then run the eboiv constructor.
 *
 * Returns 0 on success or a negative errno. On failure the stored cipher
 * handle is left NULL.
 */
static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti,
				 const char *opts)
{
	struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
	struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
	int err;

	if (IS_ERR(tfm)) {
		elephant->tfm = NULL;
		return PTR_ERR(tfm);
	}
	elephant->tfm = tfm;

	err = crypt_iv_eboiv_ctr(cc, ti, NULL);
	if (err)
		crypt_iv_elephant_dtr(cc);	/* undo the allocation above */

	return err;
}
/*
 * Convert n 32-bit words in d from little-endian (on-disk) to CPU byte order,
 * in place. Compiles to nothing when __LITTLE_ENDIAN is defined.
 */
static void diffuser_disk_to_cpu(u32 *d, size_t n)
{
#ifndef __LITTLE_ENDIAN
	int idx = 0;

	while (idx < n) {
		d[idx] = le32_to_cpu((__le32)d[idx]);
		idx++;
	}
#endif
}
/*
 * Convert n 32-bit words in d from CPU byte order to little-endian (on-disk),
 * in place. Compiles to nothing when __LITTLE_ENDIAN is defined.
 */
static void diffuser_cpu_to_disk(__le32 *d, size_t n)
{
#ifndef __LITTLE_ENDIAN
	int idx = 0;

	while (idx < n) {
		d[idx] = cpu_to_le32((u32)d[idx]);
		idx++;
	}
#endif
}
/*
 * Elephant diffuser A, decrypt direction.
 *
 * Makes five forward passes over the n-word buffer d. Each word d[i1] has
 * added to it the XOR of d[i1 - 2] and d[i1 - 5] (indices wrapping modulo n).
 * Within each group of four words the second operand is rotated left by 9
 * for the first word and by 13 for the third; the second and fourth words
 * use it unrotated.
 *
 * NOTE(review): the loop body handles four words per iteration, so n is
 * presumably a multiple of 4 and at least 5 -- confirm against callers.
 */
static void diffuser_a_decrypt(u32 *d, size_t n)
{
int i, i1, i2, i3;
for (i = 0; i < 5; i++) {
i1 = 0;
/* i2 and i3 trail i1 by 2 and 5 words, starting wrapped at the buffer end */
i2 = n - 2;
i3 = n - 5;
while (i1 < (n - 1)) {
/* (x << 9 | x >> 23) is a left-rotate by 9, assuming u32 is 32 bits wide */
d[i1] += d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
i1++; i2++; i3++;
/* wrap i3 back to the start of the buffer */
if (i3 >= n)
i3 -= n;
d[i1] += d[i2] ^ d[i3];
i1++; i2++; i3++;
/* wrap i2 back to the start of the buffer */
if (i2 >= n)
i2 -= n;
/* left-rotate by 13 */
d[i1] += d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
i1++; i2++; i3++;
d[i1] += d[i2] ^ d[i3];
i1++; i2++; i3++;
}
}
}
/*
 * Elephant diffuser A, encrypt direction.
 *
 * Makes five backward passes over the n-word buffer d, from the last word
 * to the first. Each word d[i1] has subtracted from it the XOR of d[i1 - 2]
 * and d[i1 - 5] (indices wrapping modulo n), using the same per-position
 * rotations (9 and 13) that diffuser_a_decrypt() uses when adding.
 *
 * NOTE(review): the loop body handles four words per iteration, so n is
 * presumably a multiple of 4 and at least 6 -- confirm against callers.
 */
static void diffuser_a_encrypt(u32 *d, size_t n)
{
int i, i1, i2, i3;
for (i = 0; i < 5; i++) {
/* start at the last word; i2 and i3 trail it by 2 and 5 words */
i1 = n - 1;
i2 = n - 2 - 1;
i3 = n - 5 - 1;
while (i1 > 0) {
d[i1] -= d[i2] ^ d[i3];
i1--; i2--; i3--;
/* (x << 13 | x >> 19) is a left-rotate by 13, assuming u32 is 32 bits wide */
d[i1] -= d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
i1--; i2--; i3--;
/* wrap i2 around to the end of the buffer */
if (i2 < 0)
i2 += n;
d[i1] -= d[i2] ^ d[i3];
i1--; i2--; i3--;
/* wrap i3 around to the end of the buffer */
if (i3 < 0)
i3 += n;
/* left-rotate by 9 */
d[i1] -= d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
i1--; i2--; i3--;
}
}
}
/*
 * Elephant diffuser B, decrypt direction.
 *
 * Makes three forward passes over the n-word buffer d. Each word d[i1] has
 * added to it the XOR of d[i1 + 2] and d[i1 + 5] (indices wrapping modulo n).
 * Within each group of four words the second operand is rotated left by 10
 * for the second word and by 25 for the fourth; the first and third words
 * use it unrotated.
 *
 * NOTE(review): the loop body handles four words per iteration, so n is
 * presumably a multiple of 4 and greater than 5 -- confirm against callers.
 */
static void diffuser_b_decrypt(u32 *d, size_t n)
{
int i, i1, i2, i3;
for (i = 0; i < 3; i++) {
i1 = 0;
/* i2 and i3 lead i1 by 2 and 5 words */
i2 = 2;
i3 = 5;
while (i1 < (n - 1)) {
d[i1] += d[i2] ^ d[i3];
i1++; i2++; i3++;
/* (x << 10 | x >> 22) is a left-rotate by 10, assuming u32 is 32 bits wide */
d[i1] += d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
i1++; i2++; i3++;
/* wrap i2 back to the start of the buffer */
if (i2 >= n)
i2 -= n;
d[i1] += d[i2] ^ d[i3];
i1++; i2++; i3++;
/* wrap i3 back to the start of the buffer */
if (i3 >= n)
i3 -= n;
/* left-rotate by 25 */
d[i1] += d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
i1++; i2++; i3++;
}
}
}
/*
 * Elephant diffuser B, encrypt direction.
 *
 * Makes three backward passes over the n-word buffer d, from the last word
 * to the first. Each word d[i1] has subtracted from it the XOR of d[i1 + 2]
 * and d[i1 + 5] (indices wrapping modulo n), using the same per-position
 * rotations (10 and 25) that diffuser_b_decrypt() uses when adding.
 *
 * NOTE(review): the loop body handles four words per iteration, so n is
 * presumably a multiple of 4 and greater than 5 -- confirm against callers.
 */
static void diffuser_b_encrypt(u32 *d, size_t n)
{
int i, i1, i2, i3;
for (i = 0; i < 3; i++) {
/* start at the last word; i2 and i3 are its +2 and +5 neighbours, already wrapped */
i1 = n - 1;
i2 = 2 - 1;
i3 = 5 - 1;
while (i1 > 0) {
/* (x << 25 | x >> 7) is a left-rotate by 25, assuming u32 is 32 bits wide */
d[i1] -= d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
i1--; i2--; i3--;
/* wrap i3 around to the end of the buffer */
if (i3 < 0)
i3 += n;
d[i1] -= d[i2] ^ d[i3];
i1--; i2--; i3--;
/* wrap i2 around to the end of the buffer */
if (i2 < 0)
i2 += n;
/* left-rotate by 10 */
d[i1] -= d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
i1--; i2--; i3--;
d[i1] -= d[i2] ^ d[i3];
i1--; i2--; i3--;
}
}
}
static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq)
{
struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
u8 *es, *ks, *data, *data2, *data_offset;
struct skcipher_request *req;
struct scatterlist *sg, *sg2, src, dst;
struct crypto_wait wait;
int i, r;
req = skcipher_request_alloc(elephant->tfm, GFP_NOIO);
es = kzalloc(16, GFP_NOIO); /* Key for AES */
ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */
if (!req || !es || !ks) {
r = -ENOMEM;
goto out;
}
*(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
/* E(Ks, e(s)) */
sg_init_one(&src, es, 16);
sg_init_one(&dst, ks, 16);
skcipher_request_set_crypt(req, &src, &dst, 16, NULL);
skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
if (r)
goto out;
/* E(Ks, e'(s)) */
es[15] = 0x80;
sg_init_one(&dst, &ks[16], 16);
r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
if (r)
goto out;
sg = crypt_get_sg_data(cc, dmreq->sg_out);
data = kmap_atomic(sg_page(sg));
data_offset = data + sg->offset;
/* Cannot modify original bio, copy to sg_out and apply Elephant to it */
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
sg2 = crypt_get_sg_data(cc, dmreq->sg_in);
data2 = kmap_atomic(sg_page(sg2));
memcpy(data_offset, data2 + sg2->offset, cc->sector_size);
kunmap_atomic(data2);
}
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_b_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_a_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
}
for (i = 0; i < (cc->sector_size / 32); i++)
crypto_xor(data_offset + i * 32, ks, 32);
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_a_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_b_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
}
kunmap_atomic(data);
out:
kzfree(ks);
kzfree(es);
skcipher_request_free(req);
return r;
}
/*
 * IV generator for the elephant mode. For a WRITE bio the Elephant layer is
 * applied first and any error from it is returned; the IV itself is then
 * produced by the eboiv generator.
 */
static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv,
				 struct dm_crypt_request *dmreq)
{
	bool writing = bio_data_dir(dmreq->ctx->bio_in) == WRITE;
	int err = writing ? crypt_iv_elephant(cc, dmreq) : 0;

	if (err)
		return err;

	return crypt_iv_eboiv_gen(cc, iv, dmreq);
}
/*
 * Post-processing hook for the elephant mode: for anything other than a
 * WRITE bio, apply the Elephant layer and return its result. WRITE bios
 * need no post-processing here and return 0.
 */
static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv,
				  struct dm_crypt_request *dmreq)
{
	if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
		return 0;

	return crypt_iv_elephant(cc, dmreq);
}
/*
 * Key the elephant cipher handle with the trailing key_extra_size bytes of
 * cc->key. Returns the result of crypto_skcipher_setkey().
 */
static int crypt_iv_elephant_init(struct crypt_config *cc)
{
	struct crypto_skcipher *tfm = cc->iv_gen_private.elephant.tfm;
	int extra_key_start = cc->key_size - cc->key_extra_size;

	return crypto_skcipher_setkey(tfm, &cc->key[extra_key_start],
				      cc->key_extra_size);
}
/*
 * Replace the key held by the elephant cipher handle with key_extra_size
 * zero bytes. Returns the result of crypto_skcipher_setkey().
 */
static int crypt_iv_elephant_wipe(struct crypt_config *cc)
{
	struct crypto_skcipher *tfm = cc->iv_gen_private.elephant.tfm;
	u8 zero_key[ELEPHANT_MAX_KEY_SIZE];

	memset(zero_key, 0, cc->key_extra_size);

	return crypto_skcipher_setkey(tfm, zero_key, cc->key_extra_size);
}
static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
......@@ -787,6 +1089,15 @@ static struct crypt_iv_operations crypt_iv_eboiv_ops = {
.generator = crypt_iv_eboiv_gen
};
/* Operations table for the "elephant" IV mode. */
static struct crypt_iv_operations crypt_iv_elephant_ops = {
	/* lifetime */
	.ctr       = crypt_iv_elephant_ctr,
	.dtr       = crypt_iv_elephant_dtr,
	/* key handling */
	.init      = crypt_iv_elephant_init,
	.wipe      = crypt_iv_elephant_wipe,
	/* per-request hooks */
	.generator = crypt_iv_elephant_gen,
	.post      = crypt_iv_elephant_post,
};
/*
* Integrity extensions
*/
......@@ -1103,6 +1414,9 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
if (r < 0)
return r;
/* Data can be already preprocessed in generator */
if (test_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags))
sg_in = sg_out;
/* Store generated IV in integrity metadata */
if (cc->integrity_iv_size)
memcpy(tag_iv, org_iv, cc->integrity_iv_size);
......@@ -2191,7 +2505,14 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
cc->iv_gen_ops = &crypt_iv_null_ops;
else if (strcmp(ivmode, "eboiv") == 0)
cc->iv_gen_ops = &crypt_iv_eboiv_ops;
else if (strcmp(ivmode, "lmk") == 0) {
else if (strcmp(ivmode, "elephant") == 0) {
cc->iv_gen_ops = &crypt_iv_elephant_ops;
cc->key_parts = 2;
cc->key_extra_size = cc->key_size / 2;
if (cc->key_extra_size > ELEPHANT_MAX_KEY_SIZE)
return -EINVAL;
set_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags);
} else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops;
/*
* Version 2 and 3 is recognised according
......@@ -2959,7 +3280,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
.version = {1, 19, 0},
.version = {1, 20, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
......
......@@ -207,16 +207,16 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
int ret = DM_MAPIO_REMAPPED;
int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
ret = __dust_map_write(dd, thisblock);
r = __dust_map_write(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
return ret;
return r;
}
static int dust_map(struct dm_target *ti, struct bio *bio)
......
......@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
......@@ -29,6 +30,9 @@
#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
/* Path properties */
struct pgpath {
......@@ -91,6 +95,8 @@ struct multipath {
struct work_struct process_queued_bios;
struct bio_list queued_bios;
struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
};
/*
......@@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
static void queue_if_no_path_timeout_work(struct timer_list *t);
/*-----------------------------------------------
* Multipath state flags.
......@@ -195,6 +202,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->ti = ti;
ti->private = m;
timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
}
return m;
......@@ -717,6 +726,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
return 0;
}
/*
 * Timer callback for the queue_if_no_path timeout: log a warning naming the
 * mapped device, then switch queue_if_no_path off for this multipath.
 */
static void queue_if_no_path_timeout_work(struct timer_list *t)
{
	struct multipath *m = from_timer(m, t, nopath_timer);

	DMWARN("queue_if_no_path timeout on %s, failing queued IO",
	       dm_device_name(dm_table_get_md(m->ti->table)));

	queue_if_no_path(m, false, false);
}
/*
* Enable the queue_if_no_path timeout if necessary.
* Called with m->lock held.
*/
static void enable_nopath_timeout(struct multipath *m)
{
unsigned long queue_if_no_path_timeout =
READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
lockdep_assert_held(&m->lock);
if (queue_if_no_path_timeout > 0 &&
atomic_read(&m->nr_valid_paths) == 0 &&
test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
mod_timer(&m->nopath_timer,
jiffies + queue_if_no_path_timeout);
}
}
/* Cancel the queue_if_no_path timer via del_timer_sync(). */
static void disable_nopath_timeout(struct multipath *m)
{
	struct timer_list *nopath = &m->nopath_timer;

	del_timer_sync(nopath);
}
/*
* An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass.
......@@ -1090,6 +1136,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
unsigned long flags;
as.argc = argc;
as.argv = argv;
......@@ -1154,6 +1201,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
spin_lock_irqsave(&m->lock, flags);
enable_nopath_timeout(m);
spin_unlock_irqrestore(&m->lock, flags);
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
......@@ -1208,6 +1259,7 @@ static void multipath_dtr(struct dm_target *ti)
{
struct multipath *m = ti->private;
disable_nopath_timeout(m);
flush_multipath_work(m);
free_multipath(m);
}
......@@ -1241,6 +1293,8 @@ static int fail_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
enable_nopath_timeout(m);
out:
spin_unlock_irqrestore(&m->lock, flags);
......@@ -1291,6 +1345,9 @@ static int reinstate_path(struct pgpath *pgpath)
process_queued_io_list(m);
}
if (pgpath->is_active)
disable_nopath_timeout(m);
return r;
}
......@@ -1444,7 +1501,7 @@ static void pg_init_done(void *data, int errors)
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
delay_retry = 1;
delay_retry = true;
/* fall through */
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
......@@ -1789,6 +1846,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
struct dm_dev *dev;
struct multipath *m = ti->private;
action_fn action;
unsigned long flags;
mutex_lock(&m->work_mutex);
......@@ -1800,9 +1858,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, true, false);
spin_lock_irqsave(&m->lock, flags);
enable_nopath_timeout(m);
spin_unlock_irqrestore(&m->lock, flags);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, false, false);
disable_nopath_timeout(m);
goto out;
}
}
......@@ -2065,6 +2127,10 @@ static void __exit dm_multipath_exit(void)
module_init(dm_multipath_init);
module_exit(dm_multipath_exit);
module_param_named(queue_if_no_path_timeout_secs,
queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
......@@ -129,7 +129,9 @@ struct raid_dev {
CTR_FLAG_RAID10_COPIES | \
CTR_FLAG_RAID10_FORMAT | \
CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET)
CTR_FLAG_DATA_OFFSET | \
CTR_FLAG_JOURNAL_DEV | \
CTR_FLAG_JOURNAL_MODE)
/* Valid options definitions per raid level... */
......@@ -3001,7 +3003,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{ 1, 254, "Cannot understand number of raid devices parameters" }
};
/* Must have <raid_type> */
arg = dm_shift_arg(&as);
if (!arg) {
ti->error = "No arguments";
......@@ -3508,8 +3509,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
unsigned long recovery;
unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
unsigned int sz = 0;
unsigned int rebuild_disks;
unsigned int write_mostly_params = 0;
unsigned int rebuild_writemostly_count = 0;
sector_t progress, resync_max_sectors, resync_mismatches;
enum sync_state state;
struct raid_type *rt;
......@@ -3593,18 +3593,20 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE:
/* Report the table line string you would use to construct this raid set */
/* Calculate raid parameter count */
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
write_mostly_params += 2;
rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks));
raid_param_cnt += rebuild_disks * 2 +
write_mostly_params +
/*
* Count any rebuild or writemostly argument pairs and subtract the
* hweight count being added below of any rebuild and writemostly ctr flags.
*/
for (i = 0; i < rs->raid_disks; i++) {
rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) +
(test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0);
}
rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) +
(test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0);
/* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. */
raid_param_cnt += rebuild_writemostly_count +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
(test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
(test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
/* Emit table line */
/* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
......@@ -3612,11 +3614,10 @@ static void raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (rebuild_disks)
if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
rs->dev[i].rdev.raid_disk);
if (test_bit(i, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i);
if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
mddev->bitmap_info.daemon_sleep);
......@@ -3626,7 +3627,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
mddev->sync_speed_max);
if (write_mostly_params)
if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
......@@ -4029,7 +4030,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
.version = {1, 15, 0},
.version = {1, 15, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
......
......@@ -1061,7 +1061,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
DMERR("Read error in exception store: "
"shutting down merge");
down_write(&s->lock);
s->merge_failed = 1;
s->merge_failed = true;
up_write(&s->lock);
}
goto shut;
......@@ -1149,7 +1149,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
shut:
down_write(&s->lock);
s->merge_failed = 1;
s->merge_failed = true;
b = __release_queued_bios_after_merge(s);
up_write(&s->lock);
error_bios(b);
......@@ -1314,7 +1314,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
s->merge_failed = 0;
s->merge_failed = false;
s->first_merging_chunk = 0;
s->num_merging_chunks = 0;
bio_list_init(&s->bios_queued_during_merge);
......
......@@ -28,7 +28,7 @@
*
* - A hierarchical btree, with 2 levels which effectively maps (thin
* dev id, virtual block) -> block_time. Block time is a 64-bit
* field holding the time in the low 24 bits, and block in the top 48
* field holding the time in the low 24 bits, and block in the top 40
* bits.
*
* BTrees consist solely of btree_nodes, that fill a block. Some are
......@@ -387,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
* Variant that is used for in-core only changes or code that
* shouldn't put the pool in service on its own (e.g. commit).
*/
static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
__acquires(pmd->root_lock)
{
down_write(&pmd->root_lock);
}
#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
__pmd_write_lock(pmd);
pmd_write_lock_in_core(pmd);
if (unlikely(!pmd->in_service))
pmd->in_service = true;
}
......@@ -811,7 +810,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
return r;
if (td->open_count)
td->changed = 0;
td->changed = false;
else {
list_del(&td->list);
kfree(td);
......@@ -831,6 +830,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
* We need to know if the thin_disk_superblock exceeds a 512-byte sector.
*/
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
BUG_ON(!rwsem_is_locked(&pmd->root_lock));
if (unlikely(!pmd->in_service))
return 0;
......@@ -953,6 +953,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return -EBUSY;
}
pmd_write_lock_in_core(pmd);
if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
r = __commit_transaction(pmd);
if (r < 0)
......@@ -961,6 +962,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
}
if (!pmd->fail_io)
__destroy_persistent_data_objects(pmd);
pmd_write_unlock(pmd);
kfree(pmd);
return 0;
......@@ -1106,7 +1108,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd,
if (r)
return r;
td->changed = 1;
td->changed = true;
td->snapshotted_time = time;
snap->mapped_blocks = td->mapped_blocks;
......@@ -1618,7 +1620,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
if (r)
return r;
td->changed = 1;
td->changed = true;
if (inserted)
td->mapped_blocks++;
......@@ -1649,7 +1651,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
return r;
td->mapped_blocks--;
td->changed = 1;
td->changed = true;
return 0;
}
......@@ -1703,7 +1705,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
}
td->mapped_blocks -= total_count;
td->changed = 1;
td->changed = true;
/*
* Reinsert the mapping tree.
......@@ -1841,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
* Care is taken to not have commit be what
* triggers putting the thin-pool in-service.
*/
__pmd_write_lock(pmd);
pmd_write_lock_in_core(pmd);
if (pmd->fail_io)
goto out;
......
......@@ -231,6 +231,7 @@ struct pool {
struct dm_target *ti; /* Only set if a pool target is bound */
struct mapped_device *pool_md;
struct block_device *data_dev;
struct block_device *md_dev;
struct dm_pool_metadata *pmd;
......@@ -281,6 +282,8 @@ struct pool {
struct dm_bio_prison_cell **cell_sort_array;
mempool_t mapping_pool;
struct bio flush_bio;
};
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
......@@ -328,7 +331,6 @@ struct pool_c {
dm_block_t low_water_blocks;
struct pool_features requested_pf; /* Features requested during table load */
struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
struct bio flush_bio;
};
/*
......@@ -2924,6 +2926,7 @@ static void __pool_destroy(struct pool *pool)
if (pool->next_mapping)
mempool_free(pool->next_mapping, &pool->mapping_pool);
mempool_exit(&pool->mapping_pool);
bio_uninit(&pool->flush_bio);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
......@@ -2933,6 +2936,7 @@ static struct kmem_cache *_new_mapping_cache;
static struct pool *pool_create(struct mapped_device *pool_md,
struct block_device *metadata_dev,
struct block_device *data_dev,
unsigned long block_size,
int read_only, char **error)
{
......@@ -3003,6 +3007,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = false;
pool->suspended = true;
pool->out_of_data_space = false;
bio_init(&pool->flush_bio, NULL, 0);
pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) {
......@@ -3040,6 +3045,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
pool->data_dev = data_dev;
__pool_table_insert(pool);
return pool;
......@@ -3081,6 +3087,7 @@ static void __pool_dec(struct pool *pool)
static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev,
struct block_device *data_dev,
unsigned long block_size, int read_only,
char **error, int *created)
{
......@@ -3091,19 +3098,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
*error = "metadata device already in use by a pool";
return ERR_PTR(-EBUSY);
}
if (pool->data_dev != data_dev) {
*error = "data device already in use by a pool";
return ERR_PTR(-EBUSY);
}
__pool_inc(pool);
} else {
pool = __pool_table_lookup(pool_md);
if (pool) {
if (pool->md_dev != metadata_dev) {
if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
*error = "different pool cannot replace a pool";
return ERR_PTR(-EINVAL);
}
__pool_inc(pool);
} else {
pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
*created = 1;
}
}
......@@ -3124,7 +3135,6 @@ static void pool_dtr(struct dm_target *ti)
__pool_dec(pt->pool);
dm_put_device(ti, pt->metadata_dev);
dm_put_device(ti, pt->data_dev);
bio_uninit(&pt->flush_bio);
kfree(pt);
mutex_unlock(&dm_thin_pool_table.mutex);
......@@ -3203,11 +3213,11 @@ static void metadata_low_callback(void *context)
*/
static int metadata_pre_commit_callback(void *context)
{
struct pool_c *pt = context;
struct bio *flush_bio = &pt->flush_bio;
struct pool *pool = context;
struct bio *flush_bio = &pool->flush_bio;
bio_reset(flush_bio);
bio_set_dev(flush_bio, pt->data_dev->bdev);
bio_set_dev(flush_bio, pool->data_dev);
flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
return submit_bio_wait(flush_bio);
......@@ -3356,7 +3366,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out;
}
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
if (IS_ERR(pool)) {
r = PTR_ERR(pool);
......@@ -3381,7 +3391,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf;
bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1;
/*
......@@ -3408,9 +3417,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto out_flags_changed;
dm_pool_register_pre_commit_callback(pt->pool->pmd,
metadata_pre_commit_callback,
pt);
dm_pool_register_pre_commit_callback(pool->pmd,
metadata_pre_commit_callback, pool);
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
......@@ -4099,7 +4107,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
.version = {1, 21, 0},
.version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
......@@ -4476,7 +4484,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
.version = {1, 21, 0},
.version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
......
......@@ -611,8 +611,22 @@ static void verity_prefetch_io(struct work_struct *work)
static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
{
sector_t block = io->block;
unsigned int n_blocks = io->n_blocks;
struct dm_verity_prefetch_work *pw;
if (v->validated_blocks) {
while (n_blocks && test_bit(block, v->validated_blocks)) {
block++;
n_blocks--;
}
while (n_blocks && test_bit(block + n_blocks - 1,
v->validated_blocks))
n_blocks--;
if (!n_blocks)
return;
}
pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
......@@ -621,8 +635,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
INIT_WORK(&pw->work, verity_prefetch_io);
pw->v = v;
pw->block = io->block;
pw->n_blocks = io->n_blocks;
pw->block = block;
pw->n_blocks = n_blocks;
queue_work(v->verify_wq, &pw->work);
}
......
......@@ -442,7 +442,13 @@ static void writecache_notify_io(unsigned long error, void *context)
complete(&endio->c);
}
static void ssd_commit_flushed(struct dm_writecache *wc)
/*
 * Sleep on the wait queue selected by @direction until the matching
 * bio_in_progress counter reads zero.  Callers in this file pass READ
 * or WRITE as @direction.
 */
static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   atomic_read(&wc->bio_in_progress[direction]) == 0);
}
static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
struct dm_io_region region;
struct dm_io_request req;
......@@ -488,17 +494,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
writecache_notify_io(0, &endio);
wait_for_completion_io(&endio.c);
if (wait_for_ios)
writecache_wait_for_ios(wc, WRITE);
writecache_disk_flush(wc, wc->ssd_dev);
memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
static void writecache_commit_flushed(struct dm_writecache *wc)
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
if (WC_MODE_PMEM(wc))
wmb();
else
ssd_commit_flushed(wc);
ssd_commit_flushed(wc, wait_for_ios);
}
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
......@@ -522,12 +531,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
writecache_error(wc, r, "error flushing metadata: %d", r);
}
/*
 * Sleep on the wait queue selected by @direction until the matching
 * bio_in_progress counter reads zero.  Callers in this file pass READ
 * or WRITE as @direction.
 */
static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   atomic_read(&wc->bio_in_progress[direction]) == 0);
}
#define WFE_RETURN_FOLLOWING 1
#define WFE_LOWEST_SEQ 2
......@@ -622,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry
wc->freelist_size++;
}
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
struct wc_entry *e;
......@@ -631,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(!wc->current_free))
return NULL;
e = wc->current_free;
if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
return NULL;
next = rb_next(&e->rb_node);
rb_erase(&e->rb_node, &wc->freetree);
if (unlikely(!next))
......@@ -640,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(list_empty(&wc->freelist)))
return NULL;
e = container_of(wc->freelist.next, struct wc_entry, lru);
if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
return NULL;
list_del(&e->lru);
}
wc->freelist_size--;
......@@ -724,15 +731,12 @@ static void writecache_flush(struct dm_writecache *wc)
e = e2;
cond_resched();
}
writecache_commit_flushed(wc);
if (!WC_MODE_PMEM(wc))
writecache_wait_for_ios(wc, WRITE);
writecache_commit_flushed(wc, true);
wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
wc->overwrote_committed = false;
......@@ -756,7 +760,7 @@ static void writecache_flush(struct dm_writecache *wc)
}
if (need_flush_after_free)
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
}
static void writecache_flush_work(struct work_struct *work)
......@@ -809,7 +813,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
}
if (discarded_something)
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
}
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
......@@ -958,7 +962,7 @@ static void writecache_resume(struct dm_target *ti)
if (need_flush) {
writecache_flush_all_metadata(wc);
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
}
wc_unlock(wc);
......@@ -1193,7 +1197,7 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
goto bio_copy;
}
}
e = writecache_pop_from_freelist(wc);
e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) {
writecache_wait_on_freelist(wc);
continue;
......@@ -1205,9 +1209,26 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
if (WC_MODE_PMEM(wc)) {
bio_copy_block(wc, bio, memory_data(wc, e));
} else {
dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
unsigned bio_size = wc->block_size;
sector_t start_cache_sec = cache_sector(wc, e);
sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
while (bio_size < bio->bi_iter.bi_size) {
struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
if (!f)
break;
write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
(bio_size >> SECTOR_SHIFT), wc->seq_count);
writecache_insert_entry(wc, f);
wc->uncommitted_blocks++;
bio_size += wc->block_size;
current_cache_sec += wc->block_size >> SECTOR_SHIFT;
}
bio_set_dev(bio, wc->ssd_dev->bdev);
bio->bi_iter.bi_sector = cache_sector(wc, e);
bio->bi_iter.bi_sector = start_cache_sec;
dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
wc->uncommitted_blocks = 0;
queue_work(wc->writeback_wq, &wc->flush_work);
......@@ -1342,7 +1363,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *
wc->writeback_size--;
n_walked++;
if (unlikely(n_walked >= ENDIO_LATENCY)) {
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
wc_unlock(wc);
wc_lock(wc);
n_walked = 0;
......@@ -1423,7 +1444,7 @@ static int writecache_endio_thread(void *data)
writecache_wait_for_ios(wc, READ);
}
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
wc_unlock(wc);
}
......@@ -1766,10 +1787,10 @@ static int init_memory(struct dm_writecache *wc)
write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
writecache_flush_all_metadata(wc);
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
writecache_commit_flushed(wc);
writecache_commit_flushed(wc, false);
return 0;
}
......
......@@ -134,6 +134,7 @@ struct dmz_metadata {
sector_t zone_bitmap_size;
unsigned int zone_nr_bitmap_blocks;
unsigned int zone_bits_per_mblk;
unsigned int nr_bitmap_blocks;
unsigned int nr_map_blocks;
......@@ -1161,7 +1162,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
/* Init */
zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
zmd->zone_nr_bitmap_blocks =
max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
DMZ_BLOCK_SIZE_BITS);
/* Allocate zone array */
zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
......@@ -1956,7 +1960,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
dmz_release_mblock(zmd, to_mblk);
dmz_release_mblock(zmd, from_mblk);
chunk_block += DMZ_BLOCK_SIZE_BITS;
chunk_block += zmd->zone_bits_per_mblk;
}
to_zone->weight = from_zone->weight;
......@@ -2017,7 +2021,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Set bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
if (count) {
......@@ -2096,7 +2100,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Clear bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_clear_bits((unsigned long *)mblk->data,
bit, nr_bits);
......@@ -2156,6 +2160,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
{
struct dmz_mblock *mblk;
unsigned int bit, set_bit, nr_bits;
unsigned int zone_bits = zmd->zone_bits_per_mblk;
unsigned long *bitmap;
int n = 0;
......@@ -2170,15 +2175,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Get offset */
bitmap = (unsigned long *) mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
nr_bits = min(nr_blocks, zone_bits - bit);
if (set)
set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
set_bit = find_next_bit(bitmap, zone_bits, bit);
else
set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
dmz_release_mblock(zmd, mblk);
n += set_bit - bit;
if (set_bit < DMZ_BLOCK_SIZE_BITS)
if (set_bit < zone_bits)
break;
nr_blocks -= nr_bits;
......@@ -2281,7 +2286,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Count bits in this block */
bitmap = mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
n += dmz_count_bits(bitmap, bit, nr_bits);
dmz_release_mblock(zmd, mblk);
......
......@@ -1859,6 +1859,7 @@ static void dm_init_normal_md_queue(struct mapped_device *md)
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
md->queue->backing_dev_info->congested_data = md;
md->queue->backing_dev_info->congested_fn = dm_any_congested;
}
......@@ -1949,7 +1950,12 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->queue)
goto bad;
md->queue->queuedata = md;
md->queue->backing_dev_info->congested_data = md;
/*
 * Default to the required bio-based ->make_request_fn until a DM
 * table is loaded and md->type is established. If a request-based
 * table is loaded, blk-mq will override it accordingly.
 */
blk_queue_make_request(md->queue, dm_make_request);
md->disk = alloc_disk_node(1, md->numa_node_id);
if (!md->disk)
......@@ -2264,7 +2270,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
case DM_TYPE_DAX_BIO_BASED:
case DM_TYPE_NVME_BIO_BASED:
dm_init_normal_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
......
......@@ -380,6 +380,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
return -ENOSPC;
}
/*
 * Find a block that is free in both @new_ll and @old_ll.
 *
 * Starting at @begin, repeatedly asks @new_ll for its next free block.
 * A candidate is accepted when it lies beyond the end of @old_ll or when
 * @old_ll reports a zero reference count for it; otherwise the search
 * resumes from the block after the rejected candidate.
 *
 * On success returns 0 with the chosen block stored in @b.  Any non-zero
 * result from sm_ll_find_free_block() or sm_ll_lookup() is returned as is.
 *
 * NOTE(review): @end is not consulted; the search is always bounded by
 * new_ll->nr_blocks.  The callers visible here pass that same value, but
 * confirm before relying on @end.
 */
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
				 dm_block_t begin, dm_block_t end, dm_block_t *b)
{
	for (;;) {
		uint32_t old_count;
		int r;

		r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b);
		if (r)
			return r;

		/* Blocks past the end of the old map cannot have been in use there. */
		if (*b >= old_ll->nr_blocks)
			return 0;

		/* Double check this block wasn't used in the old transaction. */
		r = sm_ll_lookup(old_ll, *b, &old_count);
		if (r)
			return r;

		if (!old_count)
			return 0;

		/* Still referenced by the old map: skip it and search again. */
		begin = *b + 1;
	}
}
static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
int (*mutator)(void *context, uint32_t old, uint32_t *new),
void *context, enum allocation_event *ev)
......
......@@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
dm_block_t end, dm_block_t *result);
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
dm_block_t begin, dm_block_t end, dm_block_t *result);
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
......
......@@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
/* FIXME: we should loop round a couple of times */
r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
/*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
if (r)
return r;
......
......@@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
/*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
if (r)
return r;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment