Commit a0efc03b authored by Linus Torvalds

Merge tag 'for-4.19/dm-fixes' of...

Merge tag 'for-4.19/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:

 - DM verity fix for crash due to using vmalloc'd buffers with the
   asynchronous crypto hash API.

 - Fix to both DM crypt and DM integrity targets to discontinue using
   CRYPTO_TFM_REQ_MAY_SLEEP because its use of GFP_KERNEL can lead to
   deadlock by recursing back into a filesystem.

 - Various DM raid fixes related to reshape and rebuild races.

 - Fix for DM thin-provisioning to avoid data corruption that was a
   side-effect of needing to abort DM thin metadata transaction due to
   running out of metadata space. Fix is to reserve a small amount of
   metadata space so that once it is used the DM thin-pool can finish
   its active transaction before switching to read-only mode.

* tag 'for-4.19/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm thin metadata: try to avoid ever aborting transactions
  dm raid: bump target version, update comments and documentation
  dm raid: fix RAID leg rebuild errors
  dm raid: fix rebuild of specific devices by updating superblock
  dm raid: fix stripe adding reshape deadlock
  dm raid: fix reshape race on small devices
  dm: disable CRYPTO_TFM_REQ_MAY_SLEEP to fix a GFP_KERNEL recursion deadlock
  dm verity: fix crash on bufio buffer that was allocated with vmalloc
parents 0f9aeeac 3ab91828
...@@ -348,3 +348,7 @@ Version History ...@@ -348,3 +348,7 @@ Version History
1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size and 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size and
state races. state races.
1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen 1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen
1.14.0 Fix reshape race on small devices. Fix stripe adding reshape
deadlock/potential data corruption. Update superblock when
specific devices are requested via rebuild. Fix RAID leg
rebuild errors.
...@@ -332,7 +332,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) ...@@ -332,7 +332,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
int err; int err;
desc->tfm = essiv->hash_tfm; desc->tfm = essiv->hash_tfm;
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; desc->flags = 0;
err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt); err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt);
shash_desc_zero(desc); shash_desc_zero(desc);
...@@ -606,7 +606,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, ...@@ -606,7 +606,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
int i, r; int i, r;
desc->tfm = lmk->hash_tfm; desc->tfm = lmk->hash_tfm;
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; desc->flags = 0;
r = crypto_shash_init(desc); r = crypto_shash_init(desc);
if (r) if (r)
...@@ -768,7 +768,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, ...@@ -768,7 +768,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
/* calculate crc32 for every 32bit part and xor it */ /* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm; desc->tfm = tcw->crc32_tfm;
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; desc->flags = 0;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
r = crypto_shash_init(desc); r = crypto_shash_init(desc);
if (r) if (r)
...@@ -1251,7 +1251,7 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc, ...@@ -1251,7 +1251,7 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc,
* requests if driver request queue is full. * requests if driver request queue is full.
*/ */
skcipher_request_set_callback(ctx->r.req, skcipher_request_set_callback(ctx->r.req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, CRYPTO_TFM_REQ_MAY_BACKLOG,
kcryptd_async_done, dmreq_of_req(cc, ctx->r.req)); kcryptd_async_done, dmreq_of_req(cc, ctx->r.req));
} }
...@@ -1268,7 +1268,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc, ...@@ -1268,7 +1268,7 @@ static void crypt_alloc_req_aead(struct crypt_config *cc,
* requests if driver request queue is full. * requests if driver request queue is full.
*/ */
aead_request_set_callback(ctx->r.req_aead, aead_request_set_callback(ctx->r.req_aead,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, CRYPTO_TFM_REQ_MAY_BACKLOG,
kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead)); kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead));
} }
......
...@@ -532,7 +532,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result ...@@ -532,7 +532,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
unsigned j, size; unsigned j, size;
desc->tfm = ic->journal_mac; desc->tfm = ic->journal_mac;
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; desc->flags = 0;
r = crypto_shash_init(desc); r = crypto_shash_init(desc);
if (unlikely(r)) { if (unlikely(r)) {
...@@ -676,7 +676,7 @@ static void complete_journal_encrypt(struct crypto_async_request *req, int err) ...@@ -676,7 +676,7 @@ static void complete_journal_encrypt(struct crypto_async_request *req, int err)
static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp) static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
{ {
int r; int r;
skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
complete_journal_encrypt, comp); complete_journal_encrypt, comp);
if (likely(encrypt)) if (likely(encrypt))
r = crypto_skcipher_encrypt(req); r = crypto_skcipher_encrypt(req);
......
/* /*
* Copyright (C) 2010-2011 Neil Brown * Copyright (C) 2010-2011 Neil Brown
* Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved. * Copyright (C) 2010-2018 Red Hat, Inc. All rights reserved.
* *
* This file is released under the GPL. * This file is released under the GPL.
*/ */
...@@ -29,9 +29,6 @@ ...@@ -29,9 +29,6 @@
*/ */
#define MIN_RAID456_JOURNAL_SPACE (4*2048) #define MIN_RAID456_JOURNAL_SPACE (4*2048)
/* Global list of all raid sets */
static LIST_HEAD(raid_sets);
static bool devices_handle_discard_safely = false; static bool devices_handle_discard_safely = false;
/* /*
...@@ -227,7 +224,6 @@ struct rs_layout { ...@@ -227,7 +224,6 @@ struct rs_layout {
struct raid_set { struct raid_set {
struct dm_target *ti; struct dm_target *ti;
struct list_head list;
uint32_t stripe_cache_entries; uint32_t stripe_cache_entries;
unsigned long ctr_flags; unsigned long ctr_flags;
...@@ -273,19 +269,6 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l) ...@@ -273,19 +269,6 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
mddev->new_chunk_sectors = l->new_chunk_sectors; mddev->new_chunk_sectors = l->new_chunk_sectors;
} }
/* Find any raid_set in active slot for @rs on global list */
static struct raid_set *rs_find_active(struct raid_set *rs)
{
struct raid_set *r;
struct mapped_device *md = dm_table_get_md(rs->ti->table);
list_for_each_entry(r, &raid_sets, list)
if (r != rs && dm_table_get_md(r->ti->table) == md)
return r;
return NULL;
}
/* raid10 algorithms (i.e. formats) */ /* raid10 algorithms (i.e. formats) */
#define ALGORITHM_RAID10_DEFAULT 0 #define ALGORITHM_RAID10_DEFAULT 0
#define ALGORITHM_RAID10_NEAR 1 #define ALGORITHM_RAID10_NEAR 1
...@@ -764,7 +747,6 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r ...@@ -764,7 +747,6 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
mddev_init(&rs->md); mddev_init(&rs->md);
INIT_LIST_HEAD(&rs->list);
rs->raid_disks = raid_devs; rs->raid_disks = raid_devs;
rs->delta_disks = 0; rs->delta_disks = 0;
...@@ -782,9 +764,6 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r ...@@ -782,9 +764,6 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
for (i = 0; i < raid_devs; i++) for (i = 0; i < raid_devs; i++)
md_rdev_init(&rs->dev[i].rdev); md_rdev_init(&rs->dev[i].rdev);
/* Add @rs to global list. */
list_add(&rs->list, &raid_sets);
/* /*
* Remaining items to be initialized by further RAID params: * Remaining items to be initialized by further RAID params:
* rs->md.persistent * rs->md.persistent
...@@ -797,7 +776,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r ...@@ -797,7 +776,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
return rs; return rs;
} }
/* Free all @rs allocations and remove it from global list. */ /* Free all @rs allocations */
static void raid_set_free(struct raid_set *rs) static void raid_set_free(struct raid_set *rs)
{ {
int i; int i;
...@@ -815,8 +794,6 @@ static void raid_set_free(struct raid_set *rs) ...@@ -815,8 +794,6 @@ static void raid_set_free(struct raid_set *rs)
dm_put_device(rs->ti, rs->dev[i].data_dev); dm_put_device(rs->ti, rs->dev[i].data_dev);
} }
list_del(&rs->list);
kfree(rs); kfree(rs);
} }
...@@ -2649,7 +2626,7 @@ static int rs_adjust_data_offsets(struct raid_set *rs) ...@@ -2649,7 +2626,7 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
return 0; return 0;
} }
/* HM FIXME: get InSync raid_dev? */ /* HM FIXME: get In_Sync raid_dev? */
rdev = &rs->dev[0].rdev; rdev = &rs->dev[0].rdev;
if (rs->delta_disks < 0) { if (rs->delta_disks < 0) {
...@@ -3149,6 +3126,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) ...@@ -3149,6 +3126,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
rs_set_new(rs); rs_set_new(rs);
} else if (rs_is_recovering(rs)) { } else if (rs_is_recovering(rs)) {
/* Rebuild particular devices */
if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
rs_setup_recovery(rs, MaxSector);
}
/* A recovering raid set may be resized */ /* A recovering raid set may be resized */
; /* skip setup rs */ ; /* skip setup rs */
} else if (rs_is_reshaping(rs)) { } else if (rs_is_reshaping(rs)) {
...@@ -3242,6 +3224,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) ...@@ -3242,6 +3224,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/* Start raid set read-only and assumed clean to change in raid_resume() */ /* Start raid set read-only and assumed clean to change in raid_resume() */
rs->md.ro = 1; rs->md.ro = 1;
rs->md.in_sync = 1; rs->md.in_sync = 1;
/* Keep array frozen */
set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */ /* Has to be held on running the array */
...@@ -3265,7 +3249,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) ...@@ -3265,7 +3249,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs->callbacks.congested_fn = raid_is_congested; rs->callbacks.congested_fn = raid_is_congested;
dm_table_add_target_callbacks(ti->table, &rs->callbacks); dm_table_add_target_callbacks(ti->table, &rs->callbacks);
/* If raid4/5/6 journal mode explictely requested (only possible with journal dev) -> set it */ /* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode); r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
if (r) { if (r) {
...@@ -3350,32 +3334,53 @@ static int raid_map(struct dm_target *ti, struct bio *bio) ...@@ -3350,32 +3334,53 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED; return DM_MAPIO_SUBMITTED;
} }
/* Return string describing the current sync action of @mddev */ /* Return sync state string for @state */
static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery) enum sync_state { st_frozen, st_reshape, st_resync, st_check, st_repair, st_recover, st_idle };
/*
 * Translate a sync_state value into the string reported for it.
 *
 * @state: value to translate; used directly as an index into the table.
 *
 * Returns the matching table entry, or "undef" when __within_range()
 * rejects @state as an index into the table.
 */
static const char *sync_str(enum sync_state state)
{
	/* Has to be in above sync_state order! */
	static const char * const sync_strs[] = {
		"frozen",
		"reshape",
		"resync",
		"check",
		"repair",
		"recover",
		"idle"
	};

	return __within_range(state, 0, ARRAY_SIZE(sync_strs) - 1) ? sync_strs[state] : "undef";
}
/* Return enum sync_state for @mddev derived from @recovery flags */
static const enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long recovery)
{ {
if (test_bit(MD_RECOVERY_FROZEN, &recovery)) if (test_bit(MD_RECOVERY_FROZEN, &recovery))
return "frozen"; return st_frozen;
/* The MD sync thread can be done with io but still be running */ /* The MD sync thread can be done with io or be interrupted but still be running */
if (!test_bit(MD_RECOVERY_DONE, &recovery) && if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
(test_bit(MD_RECOVERY_RUNNING, &recovery) || (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
(!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
return "reshape"; return st_reshape;
if (test_bit(MD_RECOVERY_SYNC, &recovery)) { if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
return "resync"; return st_resync;
else if (test_bit(MD_RECOVERY_CHECK, &recovery)) if (test_bit(MD_RECOVERY_CHECK, &recovery))
return "check"; return st_check;
return "repair"; return st_repair;
} }
if (test_bit(MD_RECOVERY_RECOVER, &recovery)) if (test_bit(MD_RECOVERY_RECOVER, &recovery))
return "recover"; return st_recover;
if (mddev->reshape_position != MaxSector)
return st_reshape;
} }
return "idle"; return st_idle;
} }
/* /*
...@@ -3409,6 +3414,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, ...@@ -3409,6 +3414,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
sector_t resync_max_sectors) sector_t resync_max_sectors)
{ {
sector_t r; sector_t r;
enum sync_state state;
struct mddev *mddev = &rs->md; struct mddev *mddev = &rs->md;
clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
...@@ -3419,20 +3425,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, ...@@ -3419,20 +3425,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else { } else {
if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) && state = decipher_sync_action(mddev, recovery);
!test_bit(MD_RECOVERY_INTR, &recovery) &&
(test_bit(MD_RECOVERY_NEEDED, &recovery) || if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
test_bit(MD_RECOVERY_RUNNING, &recovery)))
r = mddev->curr_resync_completed;
else
r = mddev->recovery_cp; r = mddev->recovery_cp;
else
r = mddev->curr_resync_completed;
if (r >= resync_max_sectors && if (state == st_idle && r >= resync_max_sectors) {
(!test_bit(MD_RECOVERY_REQUESTED, &recovery) ||
(!test_bit(MD_RECOVERY_FROZEN, &recovery) &&
!test_bit(MD_RECOVERY_NEEDED, &recovery) &&
!test_bit(MD_RECOVERY_RUNNING, &recovery)))) {
/* /*
* Sync complete. * Sync complete.
*/ */
...@@ -3440,24 +3440,20 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, ...@@ -3440,24 +3440,20 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
if (test_bit(MD_RECOVERY_RECOVER, &recovery)) if (test_bit(MD_RECOVERY_RECOVER, &recovery))
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) { } else if (state == st_recover)
/* /*
* In case we are recovering, the array is not in sync * In case we are recovering, the array is not in sync
* and health chars should show the recovering legs. * and health chars should show the recovering legs.
*/ */
; ;
else if (state == st_resync)
} else if (test_bit(MD_RECOVERY_SYNC, &recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
/* /*
* If "resync" is occurring, the raid set * If "resync" is occurring, the raid set
* is or may be out of sync hence the health * is or may be out of sync hence the health
* characters shall be 'a'. * characters shall be 'a'.
*/ */
set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
else if (state == st_reshape)
} else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
/* /*
* If "reshape" is occurring, the raid set * If "reshape" is occurring, the raid set
* is or may be out of sync hence the health * is or may be out of sync hence the health
...@@ -3465,7 +3461,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, ...@@ -3465,7 +3461,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
*/ */
set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
} else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { else if (state == st_check || state == st_repair)
/* /*
* If "check" or "repair" is occurring, the raid set has * If "check" or "repair" is occurring, the raid set has
* undergone an initial sync and the health characters * undergone an initial sync and the health characters
...@@ -3473,12 +3469,12 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, ...@@ -3473,12 +3469,12 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
*/ */
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else { else {
struct md_rdev *rdev; struct md_rdev *rdev;
/* /*
* We are idle and recovery is needed, prevent 'A' chars race * We are idle and recovery is needed, prevent 'A' chars race
* caused by components still set to in-sync by constrcuctor. * caused by components still set to in-sync by constructor.
*/ */
if (test_bit(MD_RECOVERY_NEEDED, &recovery)) if (test_bit(MD_RECOVERY_NEEDED, &recovery))
set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
...@@ -3542,7 +3538,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, ...@@ -3542,7 +3538,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
progress = rs_get_progress(rs, recovery, resync_max_sectors); progress = rs_get_progress(rs, recovery, resync_max_sectors);
resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
atomic64_read(&mddev->resync_mismatches) : 0; atomic64_read(&mddev->resync_mismatches) : 0;
sync_action = decipher_sync_action(&rs->md, recovery); sync_action = sync_str(decipher_sync_action(&rs->md, recovery));
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
for (i = 0; i < rs->raid_disks; i++) for (i = 0; i < rs->raid_disks; i++)
...@@ -3892,14 +3888,13 @@ static int rs_start_reshape(struct raid_set *rs) ...@@ -3892,14 +3888,13 @@ static int rs_start_reshape(struct raid_set *rs)
struct mddev *mddev = &rs->md; struct mddev *mddev = &rs->md;
struct md_personality *pers = mddev->pers; struct md_personality *pers = mddev->pers;
/* Don't allow the sync thread to work until the table gets reloaded. */
set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
r = rs_setup_reshape(rs); r = rs_setup_reshape(rs);
if (r) if (r)
return r; return r;
/* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
mddev_resume(mddev);
/* /*
* Check any reshape constraints enforced by the personality
* *
...@@ -3923,10 +3918,6 @@ static int rs_start_reshape(struct raid_set *rs) ...@@ -3923,10 +3918,6 @@ static int rs_start_reshape(struct raid_set *rs)
} }
} }
/* Suspend because a resume will happen in raid_resume() */
set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
mddev_suspend(mddev);
/* /*
* Now reshape got set up, update superblocks to * Now reshape got set up, update superblocks to
* reflect the fact so that a table reload will * reflect the fact so that a table reload will
...@@ -3947,29 +3938,6 @@ static int raid_preresume(struct dm_target *ti) ...@@ -3947,29 +3938,6 @@ static int raid_preresume(struct dm_target *ti)
if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
return 0; return 0;
if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
struct raid_set *rs_active = rs_find_active(rs);
if (rs_active) {
/*
* In case no rebuilds have been requested
* and an active table slot exists, copy
* current resynchronization completed and
* reshape position pointers across from
* suspended raid set in the active slot.
*
* This resumes the new mapping at current
* offsets to continue recover/reshape without
* necessarily redoing a raid set partially or
* causing data corruption in case of a reshape.
*/
if (rs_active->md.curr_resync_completed != MaxSector)
mddev->curr_resync_completed = rs_active->md.curr_resync_completed;
if (rs_active->md.reshape_position != MaxSector)
mddev->reshape_position = rs_active->md.reshape_position;
}
}
/* /*
* The superblocks need to be updated on disk if the * The superblocks need to be updated on disk if the
* array is new or new devices got added (thus zeroed * array is new or new devices got added (thus zeroed
...@@ -4046,7 +4014,7 @@ static void raid_resume(struct dm_target *ti) ...@@ -4046,7 +4014,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = { static struct target_type raid_target = {
.name = "raid", .name = "raid",
.version = {1, 13, 2}, .version = {1, 14, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = raid_ctr, .ctr = raid_ctr,
.dtr = raid_dtr, .dtr = raid_dtr,
......
...@@ -188,6 +188,12 @@ struct dm_pool_metadata { ...@@ -188,6 +188,12 @@ struct dm_pool_metadata {
unsigned long flags; unsigned long flags;
sector_t data_block_size; sector_t data_block_size;
/*
* We reserve a section of the metadata for commit overhead.
* All reported space does *not* include this.
*/
dm_block_t metadata_reserve;
/* /*
* Set if a transaction has to be aborted but the attempt to roll back * Set if a transaction has to be aborted but the attempt to roll back
* to the previous (good) transaction failed. The only pool metadata * to the previous (good) transaction failed. The only pool metadata
...@@ -816,6 +822,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) ...@@ -816,6 +822,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
return dm_tm_commit(pmd->tm, sblock); return dm_tm_commit(pmd->tm, sblock);
} }
/*
 * Recompute pmd->metadata_reserve from the current size of the
 * metadata space map.
 *
 * The reserve is a tenth of the total number of metadata blocks, capped
 * at 4096 blocks.  If the size cannot be queried, an error is logged and
 * the cap itself is used as the reserve.
 *
 * NOTE(review): sector_div() is applied to a dm_block_t here; confirm this
 * is appropriate on configurations where sector_t and dm_block_t differ.
 */
static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
{
	dm_block_t nr_blocks;
	dm_block_t reserve_cap = 4096; /* 16M */

	if (dm_sm_get_nr_blocks(pmd->metadata_sm, &nr_blocks)) {
		DMERR("could not get size of metadata device");
		pmd->metadata_reserve = reserve_cap;
		return;
	}

	/* Divides nr_blocks by 10 in place. */
	sector_div(nr_blocks, 10);
	pmd->metadata_reserve = min(reserve_cap, nr_blocks);
}
struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
sector_t data_block_size, sector_t data_block_size,
bool format_device) bool format_device)
...@@ -849,6 +871,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, ...@@ -849,6 +871,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
return ERR_PTR(r); return ERR_PTR(r);
} }
__set_metadata_reserve(pmd);
return pmd; return pmd;
} }
...@@ -1820,6 +1844,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, ...@@ -1820,6 +1844,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
down_read(&pmd->root_lock); down_read(&pmd->root_lock);
if (!pmd->fail_io) if (!pmd->fail_io)
r = dm_sm_get_nr_free(pmd->metadata_sm, result); r = dm_sm_get_nr_free(pmd->metadata_sm, result);
if (!r) {
if (*result < pmd->metadata_reserve)
*result = 0;
else
*result -= pmd->metadata_reserve;
}
up_read(&pmd->root_lock); up_read(&pmd->root_lock);
return r; return r;
...@@ -1932,8 +1963,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou ...@@ -1932,8 +1963,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
int r = -EINVAL; int r = -EINVAL;
down_write(&pmd->root_lock); down_write(&pmd->root_lock);
if (!pmd->fail_io) if (!pmd->fail_io) {
r = __resize_space_map(pmd->metadata_sm, new_count); r = __resize_space_map(pmd->metadata_sm, new_count);
if (!r)
__set_metadata_reserve(pmd);
}
up_write(&pmd->root_lock); up_write(&pmd->root_lock);
return r; return r;
......
...@@ -200,7 +200,13 @@ struct dm_thin_new_mapping; ...@@ -200,7 +200,13 @@ struct dm_thin_new_mapping;
enum pool_mode { enum pool_mode {
PM_WRITE, /* metadata may be changed */ PM_WRITE, /* metadata may be changed */
PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
/*
* Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
*/
PM_OUT_OF_METADATA_SPACE,
PM_READ_ONLY, /* metadata may not be changed */ PM_READ_ONLY, /* metadata may not be changed */
PM_FAIL, /* all I/O fails */ PM_FAIL, /* all I/O fails */
}; };
...@@ -1371,7 +1377,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); ...@@ -1371,7 +1377,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
static void requeue_bios(struct pool *pool); static void requeue_bios(struct pool *pool);
static void check_for_space(struct pool *pool) static bool is_read_only_pool_mode(enum pool_mode mode)
{
return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
}
static bool is_read_only(struct pool *pool)
{
return is_read_only_pool_mode(get_pool_mode(pool));
}
/*
 * Switch @pool to PM_OUT_OF_METADATA_SPACE when no free metadata blocks
 * are reported, or when the free count cannot be obtained.
 *
 * Nothing is logged and the mode is left alone if the pool is already in
 * a read-only mode.
 */
static void check_for_metadata_space(struct pool *pool)
{
	dm_block_t free_blocks;
	const char *reason;

	if (dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks))
		reason = "Could not get free metadata blocks";
	else if (free_blocks)
		return;		/* still have metadata space, nothing to do */
	else
		reason = "No free metadata blocks";

	if (is_read_only(pool))
		return;

	DMERR("%s", reason);
	set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
}
static void check_for_data_space(struct pool *pool)
{ {
int r; int r;
dm_block_t nr_free; dm_block_t nr_free;
...@@ -1397,14 +1431,16 @@ static int commit(struct pool *pool) ...@@ -1397,14 +1431,16 @@ static int commit(struct pool *pool)
{ {
int r; int r;
if (get_pool_mode(pool) >= PM_READ_ONLY) if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
return -EINVAL; return -EINVAL;
r = dm_pool_commit_metadata(pool->pmd); r = dm_pool_commit_metadata(pool->pmd);
if (r) if (r)
metadata_operation_failed(pool, "dm_pool_commit_metadata", r); metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
else else {
check_for_space(pool); check_for_metadata_space(pool);
check_for_data_space(pool);
}
return r; return r;
} }
...@@ -1470,6 +1506,19 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) ...@@ -1470,6 +1506,19 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
return r; return r;
} }
r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
if (r) {
metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
return r;
}
if (!free_blocks) {
/* Let's commit before we use up the metadata reserve. */
r = commit(pool);
if (r)
return r;
}
return 0; return 0;
} }
...@@ -1501,6 +1550,7 @@ static blk_status_t should_error_unserviceable_bio(struct pool *pool) ...@@ -1501,6 +1550,7 @@ static blk_status_t should_error_unserviceable_bio(struct pool *pool)
case PM_OUT_OF_DATA_SPACE: case PM_OUT_OF_DATA_SPACE:
return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0; return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
case PM_OUT_OF_METADATA_SPACE:
case PM_READ_ONLY: case PM_READ_ONLY:
case PM_FAIL: case PM_FAIL:
return BLK_STS_IOERR; return BLK_STS_IOERR;
...@@ -2464,8 +2514,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) ...@@ -2464,8 +2514,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
error_retry_list(pool); error_retry_list(pool);
break; break;
case PM_OUT_OF_METADATA_SPACE:
case PM_READ_ONLY: case PM_READ_ONLY:
if (old_mode != new_mode) if (!is_read_only_pool_mode(old_mode))
notify_of_pool_mode_change(pool, "read-only"); notify_of_pool_mode_change(pool, "read-only");
dm_pool_metadata_read_only(pool->pmd); dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_read_only; pool->process_bio = process_bio_read_only;
...@@ -3403,6 +3454,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) ...@@ -3403,6 +3454,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
DMINFO("%s: growing the metadata device from %llu to %llu blocks", DMINFO("%s: growing the metadata device from %llu to %llu blocks",
dm_device_name(pool->pool_md), dm_device_name(pool->pool_md),
sb_metadata_dev_size, metadata_dev_size); sb_metadata_dev_size, metadata_dev_size);
if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
set_pool_mode(pool, PM_WRITE);
r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
if (r) { if (r) {
metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
...@@ -3707,7 +3762,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv, ...@@ -3707,7 +3762,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv,
struct pool_c *pt = ti->private; struct pool_c *pt = ti->private;
struct pool *pool = pt->pool; struct pool *pool = pt->pool;
if (get_pool_mode(pool) >= PM_READ_ONLY) { if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode", DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
dm_device_name(pool->pool_md)); dm_device_name(pool->pool_md));
return -EOPNOTSUPP; return -EOPNOTSUPP;
...@@ -3781,6 +3836,7 @@ static void pool_status(struct dm_target *ti, status_type_t type, ...@@ -3781,6 +3836,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
dm_block_t nr_blocks_data; dm_block_t nr_blocks_data;
dm_block_t nr_blocks_metadata; dm_block_t nr_blocks_metadata;
dm_block_t held_root; dm_block_t held_root;
enum pool_mode mode;
char buf[BDEVNAME_SIZE]; char buf[BDEVNAME_SIZE];
char buf2[BDEVNAME_SIZE]; char buf2[BDEVNAME_SIZE];
struct pool_c *pt = ti->private; struct pool_c *pt = ti->private;
...@@ -3851,9 +3907,10 @@ static void pool_status(struct dm_target *ti, status_type_t type, ...@@ -3851,9 +3907,10 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else else
DMEMIT("- "); DMEMIT("- ");
if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) mode = get_pool_mode(pool);
if (mode == PM_OUT_OF_DATA_SPACE)
DMEMIT("out_of_data_space "); DMEMIT("out_of_data_space ");
else if (pool->pf.mode == PM_READ_ONLY) else if (is_read_only_pool_mode(mode))
DMEMIT("ro "); DMEMIT("ro ");
else else
DMEMIT("rw "); DMEMIT("rw ");
......
...@@ -99,10 +99,26 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, ...@@ -99,10 +99,26 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
{ {
struct scatterlist sg; struct scatterlist sg;
if (likely(!is_vmalloc_addr(data))) {
sg_init_one(&sg, data, len); sg_init_one(&sg, data, len);
ahash_request_set_crypt(req, &sg, NULL, len); ahash_request_set_crypt(req, &sg, NULL, len);
return crypto_wait_req(crypto_ahash_update(req), wait); return crypto_wait_req(crypto_ahash_update(req), wait);
} else {
do {
int r;
size_t this_step = min_t(size_t, len, PAGE_SIZE - offset_in_page(data));
flush_kernel_vmap_range((void *)data, this_step);
sg_init_table(&sg, 1);
sg_set_page(&sg, vmalloc_to_page(data), this_step, offset_in_page(data));
ahash_request_set_crypt(req, &sg, NULL, this_step);
r = crypto_wait_req(crypto_ahash_update(req), wait);
if (unlikely(r))
return r;
data += this_step;
len -= this_step;
} while (len);
return 0;
}
} }
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment