Commit 3e1a0699 authored by Joe Thornber, committed by Mike Snitzer

dm thin: fix out of data space handling

Ideally a thin pool would never run out of data space; the low water
mark would trigger userland to extend the pool before we completely run
out of space.  However, many small random IOs to unprovisioned space can
consume data space at an alarming rate.  Adjust your low water mark if
you're frequently seeing "out-of-data-space" mode.

Before this fix, if data space ran out the pool would be put in
PM_READ_ONLY mode which also aborted the pool's current metadata
transaction (data loss for any changes in the transaction).  This had a
side-effect of needlessly compromising data consistency.  And retry of
queued unserviceable bios, once the data pool was resized, could
initiate changes to potentially inconsistent pool metadata.

Now when the pool's data space is exhausted transition to a new pool
mode (PM_OUT_OF_DATA_SPACE) that allows metadata to be changed but data
may not be allocated.  This allows users to remove thin volumes or
discard data to recover data space.

The pool is no longer put in PM_READ_ONLY mode in response to the pool
running out of data space.  And PM_READ_ONLY mode no longer aborts the
pool's current metadata transaction.  Also, set_pool_mode() will now
notify userspace when the pool mode is changed.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
parent 07f2b6e0
...@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, ...@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
struct dm_thin_new_mapping; struct dm_thin_new_mapping;
/* /*
* The pool runs in 3 modes. Ordered in degraded order for comparisons. * The pool runs in 4 modes. Ordered in degraded order for comparisons.
*/ */
enum pool_mode { enum pool_mode {
PM_WRITE, /* metadata may be changed */ PM_WRITE, /* metadata may be changed */
PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
PM_READ_ONLY, /* metadata may not be changed */ PM_READ_ONLY, /* metadata may not be changed */
PM_FAIL, /* all I/O fails */ PM_FAIL, /* all I/O fails */
}; };
...@@ -198,7 +199,6 @@ struct pool { ...@@ -198,7 +199,6 @@ struct pool {
}; };
static enum pool_mode get_pool_mode(struct pool *pool); static enum pool_mode get_pool_mode(struct pool *pool);
static void out_of_data_space(struct pool *pool);
static void metadata_operation_failed(struct pool *pool, const char *op, int r); static void metadata_operation_failed(struct pool *pool, const char *op, int r);
/* /*
...@@ -399,6 +399,23 @@ static void requeue_io(struct thin_c *tc) ...@@ -399,6 +399,23 @@ static void requeue_io(struct thin_c *tc)
spin_unlock_irqrestore(&pool->lock, flags); spin_unlock_irqrestore(&pool->lock, flags);
} }
/*
 * Fail every bio currently queued on pool->retry_on_resume_list.
 *
 * The queued bios are moved onto a local list while pool->lock is held,
 * leaving the pool's list empty; they are then completed with
 * bio_io_error() only after the lock has been dropped.
 */
static void error_retry_list(struct pool *pool)
{
struct bio *bio;
unsigned long flags;
struct bio_list bios;
bio_list_init(&bios);
/* Detach the whole retry list under the lock. */
spin_lock_irqsave(&pool->lock, flags);
bio_list_merge(&bios, &pool->retry_on_resume_list);
bio_list_init(&pool->retry_on_resume_list);
spin_unlock_irqrestore(&pool->lock, flags);
/* Error the detached bios without holding pool->lock. */
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio);
}
/* /*
* This section of code contains the logic for processing a thin device's IO. * This section of code contains the logic for processing a thin device's IO.
* Much of the code depends on pool object resources (lists, workqueues, etc) * Much of the code depends on pool object resources (lists, workqueues, etc)
...@@ -925,13 +942,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) ...@@ -925,13 +942,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
} }
} }
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
static int alloc_data_block(struct thin_c *tc, dm_block_t *result) static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{ {
int r; int r;
dm_block_t free_blocks; dm_block_t free_blocks;
struct pool *pool = tc->pool; struct pool *pool = tc->pool;
if (get_pool_mode(pool) != PM_WRITE) if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
return -EINVAL; return -EINVAL;
r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
...@@ -958,7 +977,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) ...@@ -958,7 +977,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
} }
if (!free_blocks) { if (!free_blocks) {
out_of_data_space(pool); set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
return -ENOSPC; return -ENOSPC;
} }
} }
...@@ -988,15 +1007,32 @@ static void retry_on_resume(struct bio *bio) ...@@ -988,15 +1007,32 @@ static void retry_on_resume(struct bio *bio)
spin_unlock_irqrestore(&pool->lock, flags); spin_unlock_irqrestore(&pool->lock, flags);
} }
static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) static bool should_error_unserviceable_bio(struct pool *pool)
{ {
/* enum pool_mode m = get_pool_mode(pool);
* When pool is read-only, no cell locking is needed because
* nothing is changing. switch (m) {
*/ case PM_WRITE:
WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); /* Shouldn't get here */
DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
return true;
case PM_OUT_OF_DATA_SPACE:
return pool->pf.error_if_no_space;
case PM_READ_ONLY:
case PM_FAIL:
return true;
default:
/* Shouldn't get here */
DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
return true;
}
}
if (pool->pf.error_if_no_space) static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
if (should_error_unserviceable_bio(pool))
bio_io_error(bio); bio_io_error(bio);
else else
retry_on_resume(bio); retry_on_resume(bio);
...@@ -1007,11 +1043,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c ...@@ -1007,11 +1043,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
struct bio *bio; struct bio *bio;
struct bio_list bios; struct bio_list bios;
if (should_error_unserviceable_bio(pool)) {
cell_error(pool, cell);
return;
}
bio_list_init(&bios); bio_list_init(&bios);
cell_release(pool, cell, &bios); cell_release(pool, cell, &bios);
while ((bio = bio_list_pop(&bios))) if (should_error_unserviceable_bio(pool))
handle_unserviceable_bio(pool, bio); while ((bio = bio_list_pop(&bios)))
bio_io_error(bio);
else
while ((bio = bio_list_pop(&bios)))
retry_on_resume(bio);
} }
static void process_discard(struct thin_c *tc, struct bio *bio) static void process_discard(struct thin_c *tc, struct bio *bio)
...@@ -1296,6 +1341,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) ...@@ -1296,6 +1341,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
} }
} }
/*
 * Complete @bio immediately with status 0; no other work is done and
 * @tc is not used.  set_pool_mode() installs this as the pool's
 * process_discard handler for PM_READ_ONLY mode.
 */
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
bio_endio(bio, 0);
}
static void process_bio_fail(struct thin_c *tc, struct bio *bio) static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{ {
bio_io_error(bio); bio_io_error(bio);
...@@ -1399,9 +1449,15 @@ static enum pool_mode get_pool_mode(struct pool *pool) ...@@ -1399,9 +1449,15 @@ static enum pool_mode get_pool_mode(struct pool *pool)
return pool->pf.mode; return pool->pf.mode;
} }
/*
 * Announce a pool mode change: raise a dm table event on the pool
 * target's table and log the transition at info level.
 *
 * @new_mode is the human-readable mode name substituted into the log
 * message (callers pass e.g. "failure", "read-only", "write").
 */
static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
{
dm_table_event(pool->ti->table);
DMINFO("%s: switching pool to %s mode",
dm_device_name(pool->pool_md), new_mode);
}
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{ {
int r;
struct pool_c *pt = pool->ti->private; struct pool_c *pt = pool->ti->private;
bool needs_check = dm_pool_metadata_needs_check(pool->pmd); bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
enum pool_mode old_mode = get_pool_mode(pool); enum pool_mode old_mode = get_pool_mode(pool);
...@@ -1429,38 +1485,48 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) ...@@ -1429,38 +1485,48 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
switch (new_mode) { switch (new_mode) {
case PM_FAIL: case PM_FAIL:
if (old_mode != new_mode) if (old_mode != new_mode)
DMERR("%s: switching pool to failure mode", notify_of_pool_mode_change(pool, "failure");
dm_device_name(pool->pool_md));
dm_pool_metadata_read_only(pool->pmd); dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_fail; pool->process_bio = process_bio_fail;
pool->process_discard = process_bio_fail; pool->process_discard = process_bio_fail;
pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_mapping = process_prepared_mapping_fail;
pool->process_prepared_discard = process_prepared_discard_fail; pool->process_prepared_discard = process_prepared_discard_fail;
error_retry_list(pool);
break; break;
case PM_READ_ONLY: case PM_READ_ONLY:
if (old_mode != new_mode) if (old_mode != new_mode)
DMERR("%s: switching pool to read-only mode", notify_of_pool_mode_change(pool, "read-only");
dm_device_name(pool->pool_md)); dm_pool_metadata_read_only(pool->pmd);
r = dm_pool_abort_metadata(pool->pmd); pool->process_bio = process_bio_read_only;
if (r) { pool->process_discard = process_bio_success;
DMERR("%s: aborting transaction failed", pool->process_prepared_mapping = process_prepared_mapping_fail;
dm_device_name(pool->pool_md)); pool->process_prepared_discard = process_prepared_discard_passdown;
new_mode = PM_FAIL;
set_pool_mode(pool, new_mode); error_retry_list(pool);
} else { break;
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_read_only; case PM_OUT_OF_DATA_SPACE:
pool->process_discard = process_discard; /*
pool->process_prepared_mapping = process_prepared_mapping_fail; * Ideally we'd never hit this state; the low water mark
pool->process_prepared_discard = process_prepared_discard_passdown; * would trigger userland to extend the pool before we
} * completely run out of data space. However, many small
* IOs to unprovisioned space can consume data space at an
* alarming rate. Adjust your low water mark if you're
* frequently seeing this mode.
*/
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "out-of-data-space");
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard;
pool->process_prepared_mapping = process_prepared_mapping;
pool->process_prepared_discard = process_prepared_discard_passdown;
break; break;
case PM_WRITE: case PM_WRITE:
if (old_mode != new_mode) if (old_mode != new_mode)
DMINFO("%s: switching pool to write mode", notify_of_pool_mode_change(pool, "write");
dm_device_name(pool->pool_md));
dm_pool_metadata_read_write(pool->pmd); dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio; pool->process_bio = process_bio;
pool->process_discard = process_discard; pool->process_discard = process_discard;
...@@ -1477,17 +1543,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) ...@@ -1477,17 +1543,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pt->adjusted_pf.mode = new_mode; pt->adjusted_pf.mode = new_mode;
} }
/*
* Rather than calling set_pool_mode directly, use these which describe the
* reason for mode degradation.
*/
static void out_of_data_space(struct pool *pool)
{
DMERR_LIMIT("%s: no free data space available.",
dm_device_name(pool->pool_md));
set_pool_mode(pool, PM_READ_ONLY);
}
static void abort_transaction(struct pool *pool) static void abort_transaction(struct pool *pool)
{ {
const char *dev_name = dm_device_name(pool->pool_md); const char *dev_name = dm_device_name(pool->pool_md);
...@@ -2719,7 +2774,9 @@ static void pool_status(struct dm_target *ti, status_type_t type, ...@@ -2719,7 +2774,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else else
DMEMIT("- "); DMEMIT("- ");
if (pool->pf.mode == PM_READ_ONLY) if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
DMEMIT("out_of_data_space ");
else if (pool->pf.mode == PM_READ_ONLY)
DMEMIT("ro "); DMEMIT("ro ");
else else
DMEMIT("rw "); DMEMIT("rw ");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment