Commit 93e1585e authored by Linus Torvalds

Merge tag 'dm-3.13-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:
 "A set of device-mapper fixes for 3.13.

  A fix for possible memory corruption during DM table load, a fix for
  a possible leak of snapshot space in case of a crash, a fix for a
  possible deadlock due to a shared workqueue in the delay target, and
  fixes to initialize the read-only module parameters that are used to
  export metrics for dm stats and dm bufio.

  Quite a few stable fixes were identified for both the thin-
  provisioning and caching targets as a result of increased regression
  testing using the device-mapper-test-suite (dmts).  The most notable
  of these are the reference counting fixes for the space map btree that
  is used by the dm-array interface -- without these the dm-cache
  metadata will leak, resulting in dm-cache devices running out of
  metadata blocks.  Also, some important fixes related to the
  thin-provisioning target's transition to read-only mode on error"

* tag 'dm-3.13-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm array: fix a reference counting bug in shadow_ablock
  dm space map: disallow decrementing a reference count below zero
  dm stats: initialize read-only module parameter
  dm bufio: initialize read-only module parameters
  dm cache: actually resize cache
  dm cache: update Documentation for invalidate_cblocks's range syntax
  dm cache policy mq: fix promotions to occur as expected
  dm thin: allow pool in read-only mode to transition to read-write mode
  dm thin: re-establish read-only state when switching to fail mode
  dm thin: always fallback the pool mode if commit fails
  dm thin: switch to read-only mode if metadata space is exhausted
  dm thin: switch to read only mode if a mapping insert fails
  dm space map metadata: return on failure in sm_metadata_new_block
  dm table: fail dm_table_create on dm_round_up overflow
  dm snapshot: avoid snapshot space leak on crash
  dm delay: fix a possible deadlock due to shared workqueue
parents 1008ebb6 ed9571f0
@@ -266,10 +266,12 @@ E.g.
 Invalidation is removing an entry from the cache without writing it
 back. Cache blocks can be invalidated via the invalidate_cblocks
 message, which takes an arbitrary number of cblock ranges. Each cblock
-must be expressed as a decimal value, in the future a variant message
-that takes cblock ranges expressed in hexidecimal may be needed to
-better support efficient invalidation of larger caches. The cache must
-be in passthrough mode when invalidate_cblocks is used.
+range's end value is "one past the end", meaning 5-10 expresses a range
+of values from 5 to 9. Each cblock must be expressed as a decimal
+value, in the future a variant message that takes cblock ranges
+expressed in hexidecimal may be needed to better support efficient
+invalidation of larger caches. The cache must be in passthrough mode
+when invalidate_cblocks is used.
    invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]*
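For illustration, a message like this is normally issued with dmsetup; a minimal usage sketch, assuming a cache device named "my-cache" already in passthrough mode (the device name and the cblock numbers are made up for the example):

    dmsetup message my-cache 0 invalidate_cblocks 2345 3400-4500

With the range syntax documented above, 3400-4500 invalidates cblocks 3400 through 4499, since a range's end value is one past the end.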
@@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void)
 {
         __u64 mem;
+        dm_bufio_allocated_kmem_cache = 0;
+        dm_bufio_allocated_get_free_pages = 0;
+        dm_bufio_allocated_vmalloc = 0;
+        dm_bufio_current_allocated = 0;
         memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
         memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
@@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
         int r = 0;
         bool updated = updated_this_tick(mq, e);
-        requeue_and_update_tick(mq, e);
         if ((!discarded_oblock && updated) ||
-            !should_promote(mq, e, discarded_oblock, data_dir))
+            !should_promote(mq, e, discarded_oblock, data_dir)) {
+                requeue_and_update_tick(mq, e);
                 result->op = POLICY_MISS;
-        else if (!can_migrate)
+        } else if (!can_migrate)
                 r = -EWOULDBLOCK;
-        else
+        else {
+                requeue_and_update_tick(mq, e);
                 r = pre_cache_to_cache(mq, e, result);
+        }
         return r;
 }
@@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 {
         int r;
-        r = dm_cache_resize(cache->cmd, cache->cache_size);
+        r = dm_cache_resize(cache->cmd, new_size);
         if (r) {
                 DMERR("could not resize cache metadata");
                 return r;
@@ -20,6 +20,7 @@
 struct delay_c {
         struct timer_list delay_timer;
         struct mutex timer_lock;
+        struct workqueue_struct *kdelayd_wq;
         struct work_struct flush_expired_bios;
         struct list_head delayed_bios;
         atomic_t may_delay;
@@ -45,14 +46,13 @@ struct dm_delay_info {
 static DEFINE_MUTEX(delayed_bios_lock);
-static struct workqueue_struct *kdelayd_wq;
 static struct kmem_cache *delayed_cache;
 static void handle_delayed_timer(unsigned long data)
 {
         struct delay_c *dc = (struct delay_c *)data;
-        queue_work(kdelayd_wq, &dc->flush_expired_bios);
+        queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
 }
 static void queue_timeout(struct delay_c *dc, unsigned long expires)
@@ -191,6 +191,12 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                 goto bad_dev_write;
         }
+        dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
+        if (!dc->kdelayd_wq) {
+                DMERR("Couldn't start kdelayd");
+                goto bad_queue;
+        }
         setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
         INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
@@ -203,6 +209,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         ti->private = dc;
         return 0;
+bad_queue:
+        mempool_destroy(dc->delayed_pool);
 bad_dev_write:
         if (dc->dev_write)
                 dm_put_device(ti, dc->dev_write);
@@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti)
 {
         struct delay_c *dc = ti->private;
-        flush_workqueue(kdelayd_wq);
+        destroy_workqueue(dc->kdelayd_wq);
         dm_put_device(ti, dc->dev_read);
@@ -350,12 +358,6 @@ static int __init dm_delay_init(void)
 {
         int r = -ENOMEM;
-        kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
-        if (!kdelayd_wq) {
-                DMERR("Couldn't start kdelayd");
-                goto bad_queue;
-        }
         delayed_cache = KMEM_CACHE(dm_delay_info, 0);
         if (!delayed_cache) {
                 DMERR("Couldn't create delayed bio cache.");
@@ -373,8 +375,6 @@ static int __init dm_delay_init(void)
 bad_register:
         kmem_cache_destroy(delayed_cache);
 bad_memcache:
-        destroy_workqueue(kdelayd_wq);
-bad_queue:
         return r;
 }
@@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void)
 {
         dm_unregister_target(&delay_target);
         kmem_cache_destroy(delayed_cache);
-        destroy_workqueue(kdelayd_wq);
 }
 /* Module hooks */
@@ -66,6 +66,18 @@ struct dm_snapshot {
         atomic_t pending_exceptions_count;
+        /* Protected by "lock" */
+        sector_t exception_start_sequence;
+        /* Protected by kcopyd single-threaded callback */
+        sector_t exception_complete_sequence;
+        /*
+         * A list of pending exceptions that completed out of order.
+         * Protected by kcopyd single-threaded callback.
+         */
+        struct list_head out_of_order_list;
         mempool_t *pending_pool;
         struct dm_exception_table pending;
@@ -173,6 +185,14 @@ struct dm_snap_pending_exception {
          */
         int started;
+        /* There was copying error. */
+        int copy_error;
+        /* A sequence number, it is used for in-order completion. */
+        sector_t exception_sequence;
+        struct list_head out_of_order_entry;
         /*
          * For writing a complete chunk, bypassing the copy.
          */
@@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         s->valid = 1;
         s->active = 0;
         atomic_set(&s->pending_exceptions_count, 0);
+        s->exception_start_sequence = 0;
+        s->exception_complete_sequence = 0;
+        INIT_LIST_HEAD(&s->out_of_order_list);
         init_rwsem(&s->lock);
         INIT_LIST_HEAD(&s->list);
         spin_lock_init(&s->pe_lock);
@@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success)
         pending_complete(pe, success);
 }
+static void complete_exception(struct dm_snap_pending_exception *pe)
+{
+        struct dm_snapshot *s = pe->snap;
+        if (unlikely(pe->copy_error))
+                pending_complete(pe, 0);
+        else
+                /* Update the metadata if we are persistent */
+                s->store->type->commit_exception(s->store, &pe->e,
+                                                 commit_callback, pe);
+}
 /*
  * Called when the copy I/O has finished. kcopyd actually runs
  * this code so don't block.
@@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
         struct dm_snap_pending_exception *pe = context;
         struct dm_snapshot *s = pe->snap;
-        if (read_err || write_err)
-                pending_complete(pe, 0);
-        else
-                /* Update the metadata if we are persistent */
-                s->store->type->commit_exception(s->store, &pe->e,
-                                                 commit_callback, pe);
+        pe->copy_error = read_err || write_err;
+        if (pe->exception_sequence == s->exception_complete_sequence) {
+                s->exception_complete_sequence++;
+                complete_exception(pe);
+                while (!list_empty(&s->out_of_order_list)) {
+                        pe = list_entry(s->out_of_order_list.next,
+                                        struct dm_snap_pending_exception, out_of_order_entry);
+                        if (pe->exception_sequence != s->exception_complete_sequence)
+                                break;
+                        s->exception_complete_sequence++;
+                        list_del(&pe->out_of_order_entry);
+                        complete_exception(pe);
+                }
+        } else {
+                struct list_head *lh;
+                struct dm_snap_pending_exception *pe2;
+                list_for_each_prev(lh, &s->out_of_order_list) {
+                        pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
+                        if (pe2->exception_sequence < pe->exception_sequence)
+                                break;
+                }
+                list_add(&pe->out_of_order_entry, lh);
+        }
 }
 /*
@@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s,
                 return NULL;
         }
+        pe->exception_sequence = s->exception_start_sequence++;
         dm_insert_exception(&s->pending, &pe->e);
         return pe;
@@ -2192,7 +2249,7 @@ static struct target_type origin_target = {
 static struct target_type snapshot_target = {
         .name    = "snapshot",
-        .version = {1, 11, 1},
+        .version = {1, 12, 0},
         .module  = THIS_MODULE,
         .ctr     = snapshot_ctr,
         .dtr     = snapshot_dtr,
@@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 int __init dm_statistics_init(void)
 {
+        shared_memory_amount = 0;
         dm_stat_need_rcu_barrier = 0;
         return 0;
 }
@@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
         num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
+        if (!num_targets) {
+                kfree(t);
+                return -ENOMEM;
+        }
         if (alloc_targets(t, num_targets)) {
                 kfree(t);
                 return -ENOMEM;
@@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
         up_write(&pmd->root_lock);
 }
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
+{
+        down_write(&pmd->root_lock);
+        pmd->read_only = false;
+        dm_bm_set_read_write(pmd->bm);
+        up_write(&pmd->root_lock);
+}
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
                                         dm_block_t threshold,
                                         dm_sm_threshold_fn fn,
@@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz
  * that nothing is changing.
  */
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
                                         dm_block_t threshold,
@@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
          */
         r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
         if (r) {
-                DMERR_LIMIT("dm_thin_insert_block() failed");
+                DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
+                            dm_device_name(pool->pool_md), r);
+                set_pool_mode(pool, PM_READ_ONLY);
                 cell_error(pool, m->cell);
                 goto out;
         }
@@ -881,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
         }
 }
-static int commit(struct pool *pool)
-{
-        int r;
-        r = dm_pool_commit_metadata(pool->pmd);
-        if (r)
-                DMERR_LIMIT("%s: commit failed: error = %d",
-                            dm_device_name(pool->pool_md), r);
-        return r;
-}
 /*
  * A non-zero return indicates read_only or fail_io mode.
  * Many callers don't care about the return value.
  */
-static int commit_or_fallback(struct pool *pool)
+static int commit(struct pool *pool)
 {
         int r;
         if (get_pool_mode(pool) != PM_WRITE)
                 return -EINVAL;
-        r = commit(pool);
-        if (r)
+        r = dm_pool_commit_metadata(pool->pmd);
+        if (r) {
+                DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
+                            dm_device_name(pool->pool_md), r);
                 set_pool_mode(pool, PM_READ_ONLY);
+        }
         return r;
 }
@@ -943,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
                  * Try to commit to see if that will free up some
                  * more space.
                  */
-                (void) commit_or_fallback(pool);
+                r = commit(pool);
+                if (r)
+                        return r;
                 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
                 if (r)
@@ -957,7 +952,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
                  * table reload).
                  */
                 if (!free_blocks) {
-                        DMWARN("%s: no free space available.",
+                        DMWARN("%s: no free data space available.",
                                dm_device_name(pool->pool_md));
                         spin_lock_irqsave(&pool->lock, flags);
                         pool->no_free_space = 1;
@@ -967,8 +962,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
         }
         r = dm_pool_alloc_data_block(pool->pmd, result);
-        if (r)
+        if (r) {
+                if (r == -ENOSPC &&
+                    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
+                    !free_blocks) {
+                        DMWARN("%s: no free metadata space available.",
+                               dm_device_name(pool->pool_md));
+                        set_pool_mode(pool, PM_READ_ONLY);
+                }
                 return r;
+        }
         return 0;
 }
@@ -1349,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool)
         if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
                 return;
-        if (commit_or_fallback(pool)) {
+        if (commit(pool)) {
                 while ((bio = bio_list_pop(&bios)))
                         bio_io_error(bio);
                 return;
@@ -1397,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
         case PM_FAIL:
                 DMERR("%s: switching pool to failure mode",
                       dm_device_name(pool->pool_md));
+                dm_pool_metadata_read_only(pool->pmd);
                 pool->process_bio = process_bio_fail;
                 pool->process_discard = process_bio_fail;
                 pool->process_prepared_mapping = process_prepared_mapping_fail;
@@ -1421,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
                 break;
         case PM_WRITE:
+                dm_pool_metadata_read_write(pool->pmd);
                 pool->process_bio = process_bio;
                 pool->process_discard = process_discard;
                 pool->process_prepared_mapping = process_prepared_mapping;
@@ -1637,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
         struct pool_c *pt = ti->private;
         /*
-         * We want to make sure that degraded pools are never upgraded.
+         * We want to make sure that a pool in PM_FAIL mode is never upgraded.
          */
         enum pool_mode old_mode = pool->pf.mode;
         enum pool_mode new_mode = pt->adjusted_pf.mode;
-        if (old_mode > new_mode)
+        /*
+         * If we were in PM_FAIL mode, rollback of metadata failed. We're
+         * not going to recover without a thin_repair. So we never let the
+         * pool move out of the old mode. On the other hand a PM_READ_ONLY
+         * may have been due to a lack of metadata or data space, and may
+         * now work (ie. if the underlying devices have been resized).
+         */
+        if (old_mode == PM_FAIL)
                 new_mode = old_mode;
         pool->ti = ti;
@@ -2266,7 +2278,7 @@ static int pool_preresume(struct dm_target *ti)
                 return r;
         if (need_commit1 || need_commit2)
-                (void) commit_or_fallback(pool);
+                (void) commit(pool);
         return 0;
 }
@@ -2293,7 +2305,7 @@ static void pool_postsuspend(struct dm_target *ti)
         cancel_delayed_work(&pool->waker);
         flush_workqueue(pool->wq);
-        (void) commit_or_fallback(pool);
+        (void) commit(pool);
 }
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -2427,7 +2439,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
         if (r)
                 return r;
-        (void) commit_or_fallback(pool);
+        (void) commit(pool);
         r = dm_pool_reserve_metadata_snap(pool->pmd);
         if (r)
@@ -2489,7 +2501,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
                 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
         if (!r)
-                (void) commit_or_fallback(pool);
+                (void) commit(pool);
         return r;
 }
@@ -2544,7 +2556,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
         /* Commit to ensure statistics aren't out-of-date */
         if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
-                (void) commit_or_fallback(pool);
+                (void) commit(pool);
         r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
         if (r) {
@@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
          * The shadow op will often be a noop. Only insert if it really
          * copied data.
          */
-        if (dm_block_location(*block) != b)
+        if (dm_block_location(*block) != b) {
+                /*
+                 * dm_tm_shadow_block will have already decremented the old
+                 * block, but it is still referenced by the btree. We
+                 * increment to stop the insert decrementing it below zero
+                 * when overwriting the old value.
+                 */
+                dm_tm_inc(info->btree_info.tm, b);
                 r = insert_ablock(info, index, *block, root);
+        }
         return r;
 }
@@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm)
 }
 EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
+void dm_bm_set_read_write(struct dm_block_manager *bm)
+{
+        bm->read_only = false;
+}
+EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
         return crc32c(~(u32) 0, data, len) ^ init_xor;
@@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b);
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
                            struct dm_block *superblock);
 /*
- * Request data be prefetched into the cache.
+ * Request data is prefetched into the cache.
  */
 void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
 /*
@@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * be returned if you do.
  */
 void dm_bm_set_read_only(struct dm_block_manager *bm);
+void dm_bm_set_read_write(struct dm_block_manager *bm);
 u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
@@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
 }
 static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
-                        uint32_t (*mutator)(void *context, uint32_t old),
+                        int (*mutator)(void *context, uint32_t old, uint32_t *new),
                         void *context, enum allocation_event *ev)
 {
         int r;
@@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
         if (old > 2) {
                 r = sm_ll_lookup_big_ref_count(ll, b, &old);
-                if (r < 0)
+                if (r < 0) {
+                        dm_tm_unlock(ll->tm, nb);
                         return r;
+                }
         }
-        ref_count = mutator(context, old);
+        r = mutator(context, old, &ref_count);
+        if (r) {
+                dm_tm_unlock(ll->tm, nb);
+                return r;
+        }
         if (ref_count <= 2) {
                 sm_set_bitmap(bm_le, bit, ref_count);
@@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
         return ll->save_ie(ll, index, &ie_disk);
 }
-static uint32_t set_ref_count(void *context, uint32_t old)
+static int set_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-        return *((uint32_t *) context);
+        *new = *((uint32_t *) context);
+        return 0;
 }
 int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
@@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
         return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
 }
-static uint32_t inc_ref_count(void *context, uint32_t old)
+static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-        return old + 1;
+        *new = old + 1;
+        return 0;
 }
 int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
@@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
         return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
 }
-static uint32_t dec_ref_count(void *context, uint32_t old)
+static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
 {
-        return old - 1;
+        if (!old) {
+                DMERR_LIMIT("unable to decrement a reference count below 0");
+                return -EINVAL;
+        }
+        *new = old - 1;
+        return 0;
 }
 int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
@@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
         struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
         int r = sm_metadata_new_block_(sm, b);
-        if (r)
+        if (r) {
                 DMERR("unable to allocate new metadata block");
+                return r;
+        }
         r = sm_metadata_get_nr_free(sm, &count);
-        if (r)
+        if (r) {
                 DMERR("couldn't get free block count");
+                return r;
+        }
         check_threshold(&smm->threshold, count);