Commit b0e5c294 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-4.19/dm-changes' of...

Merge tag 'for-4.19/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - A couple stable fixes for the DM writecache target.

 - A stable fix for the DM cache target that fixes the potential for
   data corruption after an unclean shutdown of a cache device using
   writeback mode.

 - Update DM integrity target to allow the metadata to be stored on a
   separate device from data.

 - Fix DM kcopyd and the snapshot target to cond_resched() where
   appropriate and be more efficient with processing completed work.

 - A few fixes and improvements for DM crypt.

 - Add DM delay target feature to configure delay of flushes independent
   of writes.

 - Update DM thin-provisioning target to include metadata_low_watermark
   threshold in pool status.

 - Fix stale DM thin-provisioning Documentation.

* tag 'for-4.19/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (26 commits)
  dm writecache: fix a crash due to reading past end of dirty_bitmap
  dm crypt: don't decrease device limits
  dm cache metadata: set dirty on all cache blocks after a crash
  dm snapshot: remove stale FIXME in snapshot_map()
  dm snapshot: improve performance by switching out_of_order_list to rbtree
  dm kcopyd: avoid softlockup in run_complete_job
  dm cache metadata: save in-core policy_hint_size to on-disk superblock
  dm thin: stop no_space_timeout worker when switching to write-mode
  dm kcopyd: return void from dm_kcopyd_copy()
  dm thin: include metadata_low_watermark threshold in pool status
  dm writecache: report start_sector in status line
  dm crypt: convert essiv from ahash to shash
  dm crypt: use wake_up_process() instead of a wait queue
  dm integrity: recalculate checksums on creation
  dm integrity: flush journal on suspend when using separate metadata device
  dm integrity: use version 2 for separate metadata
  dm integrity: allow separate metadata device
  dm integrity: add ic->start in get_data_sector()
  dm integrity: report provided data sectors in the status
  dm integrity: implement fair range locks
  ...
parents 2645b9d1 1e1132ea
......@@ -5,7 +5,8 @@ Device-Mapper's "delay" target delays reads and/or writes
and maps them to different devices.
Parameters:
<device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
<device> <offset> <delay> [<write_device> <write_offset> <write_delay>
[<flush_device> <flush_offset> <flush_delay>]]
With separate write parameters, the first set is only used for reads.
Offsets are specified in sectors.
......
......@@ -113,6 +113,10 @@ internal_hash:algorithm(:key) (the key is optional)
from an upper layer target, such as dm-crypt. The upper layer
target should check the validity of the integrity tags.
recalculate
Recalculate the integrity tags automatically. It is only valid
when using internal hash.
journal_crypt:algorithm(:key) (the key is optional)
Encrypt the journal using given algorithm to make sure that the
attacker can't read the journal. You can use a block cipher here
......
......@@ -28,17 +28,18 @@ administrator some freedom, for example to:
Status
======
These targets are very much still in the EXPERIMENTAL state. Please
do not yet rely on them in production. But do experiment and offer us
feedback. Different use cases will have different performance
characteristics, for example due to fragmentation of the data volume.
These targets are considered safe for production use. But different use
cases will have different performance characteristics, for example due
to fragmentation of the data volume.
If you find this software is not performing as expected please mail
dm-devel@redhat.com with details and we'll try our best to improve
things for you.
Userspace tools for checking and repairing the metadata are under
development.
Userspace tools for checking and repairing the metadata have been fully
developed and are available as 'thin_check' and 'thin_repair'. The name
of the package that provides these utilities varies by distribution (on
a Red Hat distribution it is named 'device-mapper-persistent-data').
Cookbook
========
......@@ -280,7 +281,7 @@ ii) Status
<transaction id> <used metadata blocks>/<total metadata blocks>
<used data blocks>/<total data blocks> <held metadata root>
ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space
needs_check|-
needs_check|- metadata_low_watermark
transaction id:
A 64-bit number used by userspace to help synchronise with metadata
......@@ -327,6 +328,11 @@ ii) Status
thin-pool can be made fully operational again. '-' indicates
needs_check is not set.
metadata_low_watermark:
Value of metadata low watermark in blocks. The kernel sets this
value internally but userspace needs to know this value to
determine if an event was caused by crossing this threshold.
iii) Messages
create_thin <dev id>
......
......@@ -363,7 +363,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->version = cpu_to_le32(cmd->version);
memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
disk_super->policy_hint_size = cpu_to_le32(0);
__copy_sm_root(cmd, disk_super);
......@@ -701,6 +701,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]);
disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);
disk_super->policy_hint_size = cpu_to_le32(cmd->policy_hint_size);
disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
......@@ -1322,6 +1323,7 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd,
dm_oblock_t oblock;
unsigned flags;
bool dirty = true;
dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
memcpy(&mapping, mapping_value_le, sizeof(mapping));
......@@ -1332,8 +1334,10 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd,
dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
memcpy(&hint, hint_value_le, sizeof(hint));
}
if (cmd->clean_when_opened)
dirty = flags & M_DIRTY;
r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY,
r = fn(context, oblock, to_cblock(cb), dirty,
le32_to_cpu(hint), hints_valid);
if (r) {
DMERR("policy couldn't load cache block %llu",
......@@ -1361,7 +1365,7 @@ static int __load_mapping_v2(struct dm_cache_metadata *cmd,
dm_oblock_t oblock;
unsigned flags;
bool dirty;
bool dirty = true;
dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
memcpy(&mapping, mapping_value_le, sizeof(mapping));
......@@ -1372,8 +1376,9 @@ static int __load_mapping_v2(struct dm_cache_metadata *cmd,
dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
memcpy(&hint, hint_value_le, sizeof(hint));
}
if (cmd->clean_when_opened)
dirty = dm_bitset_cursor_get_value(dirty_cursor);
r = fn(context, oblock, to_cblock(cb), dirty,
le32_to_cpu(hint), hints_valid);
if (r) {
......
......@@ -1188,9 +1188,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
queue_continuation(mg->cache->wq, &mg->k);
}
static int copy(struct dm_cache_migration *mg, bool promote)
static void copy(struct dm_cache_migration *mg, bool promote)
{
int r;
struct dm_io_region o_region, c_region;
struct cache *cache = mg->cache;
......@@ -1203,11 +1202,9 @@ static int copy(struct dm_cache_migration *mg, bool promote)
c_region.count = cache->sectors_per_block;
if (promote)
r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
else
r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
return r;
dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
}
static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
......@@ -1449,12 +1446,7 @@ static void mg_full_copy(struct work_struct *ws)
}
init_continuation(&mg->k, mg_upgrade_lock);
if (copy(mg, is_policy_promote)) {
DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
mg->k.input = BLK_STS_IOERR;
mg_complete(mg, false);
}
copy(mg, is_policy_promote);
}
static void mg_copy(struct work_struct *ws)
......@@ -2250,7 +2242,7 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
{0, 2, "Invalid number of cache feature arguments"},
};
int r;
int r, mode_ctr = 0;
unsigned argc;
const char *arg;
struct cache_features *cf = &ca->features;
......@@ -2264,14 +2256,20 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
while (argc--) {
arg = dm_shift_arg(as);
if (!strcasecmp(arg, "writeback"))
if (!strcasecmp(arg, "writeback")) {
cf->io_mode = CM_IO_WRITEBACK;
mode_ctr++;
}
else if (!strcasecmp(arg, "writethrough"))
else if (!strcasecmp(arg, "writethrough")) {
cf->io_mode = CM_IO_WRITETHROUGH;
mode_ctr++;
}
else if (!strcasecmp(arg, "passthrough"))
else if (!strcasecmp(arg, "passthrough")) {
cf->io_mode = CM_IO_PASSTHROUGH;
mode_ctr++;
}
else if (!strcasecmp(arg, "metadata2"))
cf->metadata_version = 2;
......@@ -2282,6 +2280,11 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
}
}
if (mode_ctr > 1) {
*error = "Duplicate cache io_mode features requested";
return -EINVAL;
}
return 0;
}
......
......@@ -99,7 +99,7 @@ struct crypt_iv_operations {
};
struct iv_essiv_private {
struct crypto_ahash *hash_tfm;
struct crypto_shash *hash_tfm;
u8 *salt;
};
......@@ -144,7 +144,7 @@ struct crypt_config {
struct workqueue_struct *io_queue;
struct workqueue_struct *crypt_queue;
wait_queue_head_t write_thread_wait;
spinlock_t write_thread_lock;
struct task_struct *write_thread;
struct rb_root write_tree;
......@@ -327,25 +327,22 @@ static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv,
static int crypt_iv_essiv_init(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
AHASH_REQUEST_ON_STACK(req, essiv->hash_tfm);
struct scatterlist sg;
SHASH_DESC_ON_STACK(desc, essiv->hash_tfm);
struct crypto_cipher *essiv_tfm;
int err;
sg_init_one(&sg, cc->key, cc->key_size);
ahash_request_set_tfm(req, essiv->hash_tfm);
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
ahash_request_set_crypt(req, &sg, essiv->salt, cc->key_size);
desc->tfm = essiv->hash_tfm;
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_ahash_digest(req);
ahash_request_zero(req);
err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt);
shash_desc_zero(desc);
if (err)
return err;
essiv_tfm = cc->iv_private;
err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
crypto_ahash_digestsize(essiv->hash_tfm));
crypto_shash_digestsize(essiv->hash_tfm));
if (err)
return err;
......@@ -356,7 +353,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
static int crypt_iv_essiv_wipe(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
unsigned salt_size = crypto_ahash_digestsize(essiv->hash_tfm);
unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm);
struct crypto_cipher *essiv_tfm;
int r, err = 0;
......@@ -408,7 +405,7 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
struct crypto_cipher *essiv_tfm;
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
crypto_free_ahash(essiv->hash_tfm);
crypto_free_shash(essiv->hash_tfm);
essiv->hash_tfm = NULL;
kzfree(essiv->salt);
......@@ -426,7 +423,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
struct crypto_cipher *essiv_tfm = NULL;
struct crypto_ahash *hash_tfm = NULL;
struct crypto_shash *hash_tfm = NULL;
u8 *salt = NULL;
int err;
......@@ -436,14 +433,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
}
/* Allocate hash algorithm */
hash_tfm = crypto_alloc_ahash(opts, 0, CRYPTO_ALG_ASYNC);
hash_tfm = crypto_alloc_shash(opts, 0, 0);
if (IS_ERR(hash_tfm)) {
ti->error = "Error initializing ESSIV hash";
err = PTR_ERR(hash_tfm);
goto bad;
}
salt = kzalloc(crypto_ahash_digestsize(hash_tfm), GFP_KERNEL);
salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL);
if (!salt) {
ti->error = "Error kmallocing salt storage in ESSIV";
err = -ENOMEM;
......@@ -454,7 +451,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
crypto_ahash_digestsize(hash_tfm));
crypto_shash_digestsize(hash_tfm));
if (IS_ERR(essiv_tfm)) {
crypt_iv_essiv_dtr(cc);
return PTR_ERR(essiv_tfm);
......@@ -465,7 +462,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
bad:
if (hash_tfm && !IS_ERR(hash_tfm))
crypto_free_ahash(hash_tfm);
crypto_free_shash(hash_tfm);
kfree(salt);
return err;
}
......@@ -1620,36 +1617,31 @@ static int dmcrypt_write(void *data)
struct rb_root write_tree;
struct blk_plug plug;
DECLARE_WAITQUEUE(wait, current);
spin_lock_irq(&cc->write_thread_wait.lock);
spin_lock_irq(&cc->write_thread_lock);
continue_locked:
if (!RB_EMPTY_ROOT(&cc->write_tree))
goto pop_from_list;
set_current_state(TASK_INTERRUPTIBLE);
__add_wait_queue(&cc->write_thread_wait, &wait);
spin_unlock_irq(&cc->write_thread_wait.lock);
spin_unlock_irq(&cc->write_thread_lock);
if (unlikely(kthread_should_stop())) {
set_current_state(TASK_RUNNING);
remove_wait_queue(&cc->write_thread_wait, &wait);
break;
}
schedule();
set_current_state(TASK_RUNNING);
spin_lock_irq(&cc->write_thread_wait.lock);
__remove_wait_queue(&cc->write_thread_wait, &wait);
spin_lock_irq(&cc->write_thread_lock);
goto continue_locked;
pop_from_list:
write_tree = cc->write_tree;
cc->write_tree = RB_ROOT;
spin_unlock_irq(&cc->write_thread_wait.lock);
spin_unlock_irq(&cc->write_thread_lock);
BUG_ON(rb_parent(write_tree.rb_node));
......@@ -1693,7 +1685,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
return;
}
spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
spin_lock_irqsave(&cc->write_thread_lock, flags);
if (RB_EMPTY_ROOT(&cc->write_tree))
wake_up_process(cc->write_thread);
rbp = &cc->write_tree.rb_node;
parent = NULL;
sector = io->sector;
......@@ -1706,9 +1700,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
}
rb_link_node(&io->rb_node, parent, rbp);
rb_insert_color(&io->rb_node, &cc->write_tree);
wake_up_locked(&cc->write_thread_wait);
spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
spin_unlock_irqrestore(&cc->write_thread_lock, flags);
}
static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
......@@ -2831,7 +2823,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
init_waitqueue_head(&cc->write_thread_wait);
spin_lock_init(&cc->write_thread_lock);
cc->write_tree = RB_ROOT;
cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
......@@ -3069,11 +3061,11 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
*/
limits->max_segment_size = PAGE_SIZE;
if (cc->sector_size != (1 << SECTOR_SHIFT)) {
limits->logical_block_size = cc->sector_size;
limits->physical_block_size = cc->sector_size;
blk_limits_io_min(limits, cc->sector_size);
}
limits->logical_block_size =
max_t(unsigned short, limits->logical_block_size, cc->sector_size);
limits->physical_block_size =
max_t(unsigned, limits->physical_block_size, cc->sector_size);
limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size);
}
static struct target_type crypt_target = {
......
......@@ -17,6 +17,13 @@
#define DM_MSG_PREFIX "delay"
struct delay_class {
struct dm_dev *dev;
sector_t start;
unsigned delay;
unsigned ops;
};
struct delay_c {
struct timer_list delay_timer;
struct mutex timer_lock;
......@@ -25,19 +32,16 @@ struct delay_c {
struct list_head delayed_bios;
atomic_t may_delay;
struct dm_dev *dev_read;
sector_t start_read;
unsigned read_delay;
unsigned reads;
struct delay_class read;
struct delay_class write;
struct delay_class flush;
struct dm_dev *dev_write;
sector_t start_write;
unsigned write_delay;
unsigned writes;
int argc;
};
struct dm_delay_info {
struct delay_c *context;
struct delay_class *class;
struct list_head list;
unsigned long expires;
};
......@@ -77,7 +81,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
{
struct dm_delay_info *delayed, *next;
unsigned long next_expires = 0;
int start_timer = 0;
unsigned long start_timer = 0;
struct bio_list flush_bios = { };
mutex_lock(&delayed_bios_lock);
......@@ -87,10 +91,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
sizeof(struct dm_delay_info));
list_del(&delayed->list);
bio_list_add(&flush_bios, bio);
if ((bio_data_dir(bio) == WRITE))
delayed->context->writes--;
else
delayed->context->reads--;
delayed->class->ops--;
continue;
}
......@@ -100,7 +101,6 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
} else
next_expires = min(next_expires, delayed->expires);
}
mutex_unlock(&delayed_bios_lock);
if (start_timer)
......@@ -117,6 +117,50 @@ static void flush_expired_bios(struct work_struct *work)
flush_bios(flush_delayed_bios(dc, 0));
}
static void delay_dtr(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
destroy_workqueue(dc->kdelayd_wq);
if (dc->read.dev)
dm_put_device(ti, dc->read.dev);
if (dc->write.dev)
dm_put_device(ti, dc->write.dev);
if (dc->flush.dev)
dm_put_device(ti, dc->flush.dev);
mutex_destroy(&dc->timer_lock);
kfree(dc);
}
static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv)
{
int ret;
unsigned long long tmpll;
char dummy;
if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
return -EINVAL;
}
c->start = tmpll;
if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) {
ti->error = "Invalid delay";
return -EINVAL;
}
ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev);
if (ret) {
ti->error = "Device lookup failed";
return ret;
}
return 0;
}
/*
* Mapping parameters:
* <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
......@@ -128,134 +172,89 @@ static void flush_expired_bios(struct work_struct *work)
static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct delay_c *dc;
unsigned long long tmpll;
char dummy;
int ret;
if (argc != 3 && argc != 6) {
ti->error = "Requires exactly 3 or 6 arguments";
if (argc != 3 && argc != 6 && argc != 9) {
ti->error = "Requires exactly 3, 6 or 9 arguments";
return -EINVAL;
}
dc = kmalloc(sizeof(*dc), GFP_KERNEL);
dc = kzalloc(sizeof(*dc), GFP_KERNEL);
if (!dc) {
ti->error = "Cannot allocate context";
return -ENOMEM;
}
dc->reads = dc->writes = 0;
ti->private = dc;
timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
INIT_LIST_HEAD(&dc->delayed_bios);
mutex_init(&dc->timer_lock);
atomic_set(&dc->may_delay, 1);
dc->argc = argc;
ret = -EINVAL;
if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
ret = delay_class_ctr(ti, &dc->read, argv);
if (ret)
goto bad;
}
dc->start_read = tmpll;
if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
ti->error = "Invalid delay";
if (argc == 3) {
ret = delay_class_ctr(ti, &dc->write, argv);
if (ret)
goto bad;
}
ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
&dc->dev_read);
if (ret) {
ti->error = "Device lookup failed";
ret = delay_class_ctr(ti, &dc->flush, argv);
if (ret)
goto bad;
}
ret = -EINVAL;
dc->dev_write = NULL;
if (argc == 3)
goto out;
if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid write device sector";
goto bad_dev_read;
}
dc->start_write = tmpll;
if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
ti->error = "Invalid write delay";
goto bad_dev_read;
ret = delay_class_ctr(ti, &dc->write, argv + 3);
if (ret)
goto bad;
if (argc == 6) {
ret = delay_class_ctr(ti, &dc->flush, argv + 3);
if (ret)
goto bad;
goto out;
}
ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table),
&dc->dev_write);
if (ret) {
ti->error = "Write device lookup failed";
goto bad_dev_read;
}
ret = delay_class_ctr(ti, &dc->flush, argv + 6);
if (ret)
goto bad;
out:
ret = -EINVAL;
dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!dc->kdelayd_wq) {
ret = -EINVAL;
DMERR("Couldn't start kdelayd");
goto bad_queue;
goto bad;
}
timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
INIT_LIST_HEAD(&dc->delayed_bios);
mutex_init(&dc->timer_lock);
atomic_set(&dc->may_delay, 1);
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->per_io_data_size = sizeof(struct dm_delay_info);
ti->private = dc;
return 0;
bad_queue:
if (dc->dev_write)
dm_put_device(ti, dc->dev_write);
bad_dev_read:
dm_put_device(ti, dc->dev_read);
bad:
kfree(dc);
delay_dtr(ti);
return ret;
}
static void delay_dtr(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
destroy_workqueue(dc->kdelayd_wq);
dm_put_device(ti, dc->dev_read);
if (dc->dev_write)
dm_put_device(ti, dc->dev_write);
mutex_destroy(&dc->timer_lock);
kfree(dc);
}
static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
{
struct dm_delay_info *delayed;
unsigned long expires = 0;
if (!delay || !atomic_read(&dc->may_delay))
if (!c->delay || !atomic_read(&dc->may_delay))
return DM_MAPIO_REMAPPED;
delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
delayed->context = dc;
delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);
mutex_lock(&delayed_bios_lock);
if (bio_data_dir(bio) == WRITE)
dc->writes++;
else
dc->reads++;
c->ops++;
list_add_tail(&delayed->list, &dc->delayed_bios);
mutex_unlock(&delayed_bios_lock);
queue_timeout(dc, expires);
......@@ -282,23 +281,28 @@ static void delay_resume(struct dm_target *ti)
static int delay_map(struct dm_target *ti, struct bio *bio)
{
struct delay_c *dc = ti->private;
struct delay_class *c;
struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
bio_set_dev(bio, dc->dev_write->bdev);
if (bio_sectors(bio))
bio->bi_iter.bi_sector = dc->start_write +
dm_target_offset(ti, bio->bi_iter.bi_sector);
return delay_bio(dc, dc->write_delay, bio);
if (bio_data_dir(bio) == WRITE) {
if (unlikely(bio->bi_opf & REQ_PREFLUSH))
c = &dc->flush;
else
c = &dc->write;
} else {
c = &dc->read;
}
delayed->class = c;
bio_set_dev(bio, c->dev->bdev);
if (bio_sectors(bio))
bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
bio_set_dev(bio, dc->dev_read->bdev);
bio->bi_iter.bi_sector = dc->start_read +
dm_target_offset(ti, bio->bi_iter.bi_sector);
return delay_bio(dc, dc->read_delay, bio);
return delay_bio(dc, c, bio);
}
#define DMEMIT_DELAY_CLASS(c) \
DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
static void delay_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
......@@ -307,17 +311,19 @@ static void delay_status(struct dm_target *ti, status_type_t type,
switch (type) {
case STATUSTYPE_INFO:
DMEMIT("%u %u", dc->reads, dc->writes);
DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops);
break;
case STATUSTYPE_TABLE:
DMEMIT("%s %llu %u", dc->dev_read->name,
(unsigned long long) dc->start_read,
dc->read_delay);
if (dc->dev_write)
DMEMIT(" %s %llu %u", dc->dev_write->name,
(unsigned long long) dc->start_write,
dc->write_delay);
DMEMIT_DELAY_CLASS(&dc->read);
if (dc->argc >= 6) {
DMEMIT(" ");
DMEMIT_DELAY_CLASS(&dc->write);
}
if (dc->argc >= 9) {
DMEMIT(" ");
DMEMIT_DELAY_CLASS(&dc->flush);
}
break;
}
}
......@@ -328,12 +334,15 @@ static int delay_iterate_devices(struct dm_target *ti,
struct delay_c *dc = ti->private;
int ret = 0;
ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data);
if (ret)
goto out;
ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data);
if (ret)
goto out;
ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data);
if (ret)
goto out;
if (dc->dev_write)
ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
out:
return ret;
......
This diff is collapsed.
......@@ -487,6 +487,8 @@ static int run_complete_job(struct kcopyd_job *job)
if (atomic_dec_and_test(&kc->nr_jobs))
wake_up(&kc->destroyq);
cond_resched();
return 0;
}
......@@ -741,7 +743,7 @@ static void split_job(struct kcopyd_job *master_job)
}
}
int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
unsigned int num_dests, struct dm_io_region *dests,
unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
{
......@@ -818,16 +820,14 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->progress = 0;
split_job(job);
}
return 0;
}
EXPORT_SYMBOL(dm_kcopyd_copy);
int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
void dm_kcopyd_zero(struct dm_kcopyd_client *kc,
unsigned num_dests, struct dm_io_region *dests,
unsigned flags, dm_kcopyd_notify_fn fn, void *context)
{
return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
}
EXPORT_SYMBOL(dm_kcopyd_zero);
......
......@@ -326,9 +326,8 @@ static void recovery_complete(int read_err, unsigned long write_err,
dm_rh_recovery_end(reg, !(read_err || write_err));
}
static int recover(struct mirror_set *ms, struct dm_region *reg)
static void recover(struct mirror_set *ms, struct dm_region *reg)
{
int r;
unsigned i;
struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest;
struct mirror *m;
......@@ -367,10 +366,8 @@ static int recover(struct mirror_set *ms, struct dm_region *reg)
if (!errors_handled(ms))
set_bit(DM_KCOPYD_IGNORE_ERROR, &flags);
r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
flags, recovery_complete, reg);
return r;
}
static void reset_ms_flags(struct mirror_set *ms)
......@@ -388,7 +385,6 @@ static void do_recovery(struct mirror_set *ms)
{
struct dm_region *reg;
struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
int r;
/*
* Start quiescing some regions.
......@@ -398,11 +394,8 @@ static void do_recovery(struct mirror_set *ms)
/*
* Copy any already quiesced regions.
*/
while ((reg = dm_rh_recovery_start(ms->rh))) {
r = recover(ms, reg);
if (r)
dm_rh_recovery_end(reg, 0);
}
while ((reg = dm_rh_recovery_start(ms->rh)))
recover(ms, reg);
/*
* Update the in sync flag.
......
......@@ -85,7 +85,7 @@ struct dm_snapshot {
* A list of pending exceptions that completed out of order.
* Protected by kcopyd single-threaded callback.
*/
struct list_head out_of_order_list;
struct rb_root out_of_order_tree;
mempool_t pending_pool;
......@@ -200,7 +200,7 @@ struct dm_snap_pending_exception {
/* A sequence number, it is used for in-order completion. */
sector_t exception_sequence;
struct list_head out_of_order_entry;
struct rb_node out_of_order_node;
/*
* For writing a complete chunk, bypassing the copy.
......@@ -1173,7 +1173,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
atomic_set(&s->pending_exceptions_count, 0);
s->exception_start_sequence = 0;
s->exception_complete_sequence = 0;
INIT_LIST_HEAD(&s->out_of_order_list);
s->out_of_order_tree = RB_ROOT;
mutex_init(&s->lock);
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
......@@ -1539,28 +1539,41 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
pe->copy_error = read_err || write_err;
if (pe->exception_sequence == s->exception_complete_sequence) {
struct rb_node *next;
s->exception_complete_sequence++;
complete_exception(pe);
while (!list_empty(&s->out_of_order_list)) {
pe = list_entry(s->out_of_order_list.next,
struct dm_snap_pending_exception, out_of_order_entry);
next = rb_first(&s->out_of_order_tree);
while (next) {
pe = rb_entry(next, struct dm_snap_pending_exception,
out_of_order_node);
if (pe->exception_sequence != s->exception_complete_sequence)
break;
next = rb_next(next);
s->exception_complete_sequence++;
list_del(&pe->out_of_order_entry);
rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
complete_exception(pe);
cond_resched();
}
} else {
struct list_head *lh;
struct rb_node *parent = NULL;
struct rb_node **p = &s->out_of_order_tree.rb_node;
struct dm_snap_pending_exception *pe2;
list_for_each_prev(lh, &s->out_of_order_list) {
pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
if (pe2->exception_sequence < pe->exception_sequence)
break;
while (*p) {
pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
parent = *p;
BUG_ON(pe->exception_sequence == pe2->exception_sequence);
if (pe->exception_sequence < pe2->exception_sequence)
p = &((*p)->rb_left);
else
p = &((*p)->rb_right);
}
list_add(&pe->out_of_order_entry, lh);
rb_link_node(&pe->out_of_order_node, parent, p);
rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
}
}
......@@ -1694,8 +1707,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (!s->valid)
return DM_MAPIO_KILL;
/* FIXME: should only take write lock if we need
* to copy an exception */
mutex_lock(&s->lock);
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
......
......@@ -1220,18 +1220,13 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
sector_t begin, sector_t end)
{
int r;
struct dm_io_region to;
to.bdev = tc->pool_dev->bdev;
to.sector = begin;
to.count = end - begin;
r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
if (r < 0) {
DMERR_LIMIT("dm_kcopyd_zero() failed");
copy_complete(1, 1, m);
}
dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
}
static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
......@@ -1257,7 +1252,6 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_bio_prison_cell *cell, struct bio *bio,
sector_t len)
{
int r;
struct pool *pool = tc->pool;
struct dm_thin_new_mapping *m = get_next_mapping(pool);
......@@ -1296,19 +1290,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
to.sector = data_dest * pool->sectors_per_block;
to.count = len;
r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
dm_kcopyd_copy(pool->copier, &from, 1, &to,
0, copy_complete, m);
if (r < 0) {
DMERR_LIMIT("dm_kcopyd_copy() failed");
copy_complete(1, 1, m);
/*
* We allow the zero to be issued, to simplify the
* error path. Otherwise we'd need to start
* worrying about decrementing the prepare_actions
* counter.
*/
}
/*
* Do we need to zero a tail region?
......@@ -2520,6 +2503,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
case PM_WRITE:
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "write");
if (old_mode == PM_OUT_OF_DATA_SPACE)
cancel_delayed_work_sync(&pool->no_space_timeout);
pool->out_of_data_space = false;
pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
dm_pool_metadata_read_write(pool->pmd);
......@@ -3890,6 +3875,8 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("- ");
DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
break;
case STATUSTYPE_TABLE:
......@@ -3979,7 +3966,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
.version = {1, 19, 0},
.version = {1, 20, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
......@@ -4353,7 +4340,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
.version = {1, 19, 0},
.version = {1, 20, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
......
......@@ -457,7 +457,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
COMPLETION_INITIALIZER_ONSTACK(endio.c),
ATOMIC_INIT(1),
};
unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG;
unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
unsigned i = 0;
while (1) {
......@@ -2240,6 +2240,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
wc->dev->name, wc->ssd_dev->name, wc->block_size);
extra_args = 0;
if (wc->start_sector)
extra_args += 2;
if (wc->high_wm_percent_set)
extra_args += 2;
if (wc->low_wm_percent_set)
......@@ -2254,6 +2256,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
extra_args++;
DMEMIT("%u", extra_args);
if (wc->start_sector)
DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
if (wc->high_wm_percent_set) {
x = (uint64_t)wc->freelist_high_watermark * 100;
x += wc->n_blocks / 2;
......@@ -2280,7 +2284,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
static struct target_type writecache_target = {
.name = "writecache",
.version = {1, 1, 0},
.version = {1, 1, 1},
.module = THIS_MODULE,
.ctr = writecache_ctr,
.dtr = writecache_dtr,
......
......@@ -161,10 +161,8 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
/* Copy the valid region */
set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags);
ret = dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
dmz_reclaim_kcopy_end, zrc);
if (ret)
return ret;
/* Wait for copy to complete */
wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY,
......
......@@ -62,7 +62,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc);
typedef void (*dm_kcopyd_notify_fn)(int read_err, unsigned long write_err,
void *context);
int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
unsigned num_dests, struct dm_io_region *dests,
unsigned flags, dm_kcopyd_notify_fn fn, void *context);
......@@ -81,7 +81,7 @@ void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
dm_kcopyd_notify_fn fn, void *context);
void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err);
int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
void dm_kcopyd_zero(struct dm_kcopyd_client *kc,
unsigned num_dests, struct dm_io_region *dests,
unsigned flags, dm_kcopyd_notify_fn fn, void *context);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment